
Commit 9fc24be

Merge pull request #62 from ROCmSoftwarePlatform/develop-upstream-sync-180709
Merging updates from Tensorflow upstream master - 07/09/2018
2 parents: b820d65 + 717e25a

722 files changed: +42,500 −12,012 lines


RELEASE.md
Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
 * Update `tf.keras` to the Keras 2.1.6 API.
 * Added [`tf.keras.layers.CuDNNGRU`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNGRU) and [`tf.keras.layers.CuDNNLSTM`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNLSTM) layers. [Try it](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb?linkId=53292082).
 * Adding support of core [feature columns](https://www.tensorflow.org/get_started/feature_columns) and [losses](https://www.tensorflow.org/api_docs/python/tf/losses) to [gradient boosted trees estimators](https://github.com/tensorflow/models/tree/master/official/boosted_trees).
-* The [python interface](https://tensorflow-dot-devsite.googleplex.com/versions/r1.9/api_docs/python/tf/contrib/lite)
+* The [python interface](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/lite)
   for the [TFLite Optimizing Converter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/toco/README.md)
   has been expanded, and the command line interface (AKA: `toco`, `tflite_convert`) is once again
   included in the standard `pip` installation.

configure.py
Lines changed: 2 additions & 0 deletions

@@ -835,6 +835,8 @@ def set_tf_cuda_version(environ_cp):
         '[Default is %s]: ') % (tf_cuda_version, default_cuda_path)
     cuda_toolkit_path = get_from_env_or_user_or_default(
         environ_cp, 'CUDA_TOOLKIT_PATH', ask_cuda_path, default_cuda_path)
+    if is_windows() or is_cygwin():
+      cuda_toolkit_path = cygpath(cuda_toolkit_path)
 
     if is_windows():
       cuda_rt_lib_path = 'lib/x64/cudart.lib'

tensorflow/BUILD
Lines changed: 16 additions & 16 deletions

@@ -445,6 +445,22 @@ filegroup(
     data = glob(["docs_src/**/*.md"]),
 )
 
+cc_library(
+    name = "grpc",
+    deps = select({
+        ":linux_s390x": ["@grpc//:grpc_unsecure"],
+        "//conditions:default": ["@grpc"],
+    }),
+)
+
+cc_library(
+    name = "grpc++",
+    deps = select({
+        ":linux_s390x": ["@grpc//:grpc++_unsecure"],
+        "//conditions:default": ["@grpc//:grpc++"],
+    }),
+)
+
 # A shared object which includes registration mechanisms for ops and
 # kernels. Does not include the implementations of any ops or kernels. Instead,
 # the library which loads libtensorflow_framework.so
@@ -594,19 +610,3 @@ py_library(
     visibility = ["//visibility:public"],
     deps = ["//tensorflow/python:no_contrib"],
 )
-
-cc_library(
-    name = "grpc",
-    deps = select({
-        ":linux_s390x": ["@grpc//:grpc_unsecure"],
-        "//conditions:default": ["@grpc"],
-    }),
-)
-
-cc_library(
-    name = "grpc++",
-    deps = select({
-        ":linux_s390x": ["@grpc//:grpc++_unsecure"],
-        "//conditions:default": ["@grpc//:grpc++"],
-    }),
-)

tensorflow/c/c_api_experimental.cc
Lines changed: 27 additions & 0 deletions

@@ -57,6 +57,33 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
   }
 }
 
+TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
+                           unsigned char gpu_memory_allow_growth) {
+  tensorflow::ConfigProto config;
+  auto* optimizer_options =
+      config.mutable_graph_options()->mutable_optimizer_options();
+  if (enable_xla_compilation) {
+    optimizer_options->set_global_jit_level(tensorflow::OptimizerOptions::ON_1);
+
+    // These XLA flags are needed to trigger XLA properly from C (more generally
+    // non-Python) clients. If this API is called again with `enable` set to
+    // false, it is safe to keep these flag values as is.
+    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
+        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    flags->tf_xla_cpu_global_jit = true;
+    flags->tf_xla_min_cluster_size = 1;
+  } else {
+    optimizer_options->set_global_jit_level(tensorflow::OptimizerOptions::OFF);
+  }
+
+  auto* gpu_options = config.mutable_gpu_options();
+  gpu_options->set_allow_growth(gpu_memory_allow_growth);
+
+  TF_Buffer* ret = TF_NewBuffer();
+  TF_CHECK_OK(MessageToBuffer(config, ret));
+  return ret;
+}
+
 const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   tensorflow::mutex_lock c(graph->mu);
   const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString();

tensorflow/c/c_api_experimental.h
Lines changed: 12 additions & 2 deletions

@@ -55,11 +55,21 @@ extern "C" {
 // set XLA flag values to prepare for XLA compilation. Otherwise set
 // global_jit_level to OFF.
 //
-// This API is syntax sugar over TF_SetConfig(), and is used by clients that
-// cannot read/write the tensorflow.ConfigProto proto.
+// This and the next API are syntax sugar over TF_SetConfig(), and is used by
+// clients that cannot read/write the tensorflow.ConfigProto proto.
+// TODO: Migrate to TF_CreateConfig() below.
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
+// Create a serialized tensorflow.ConfigProto proto, where:
+//
+// a) ConfigProto.optimizer_options.global_jit_level is set to to ON_1 if
+// `enable_xla_compilation` is non-zero, and OFF otherwise.
+// b) ConfigProto.gpu_options.allow_growth is set to `gpu_memory_allow_growth`.
+TF_CAPI_EXPORT extern TF_Buffer* TF_CreateConfig(
+    unsigned char enable_xla_compilation,
+    unsigned char gpu_memory_allow_growth);
+
 // Returns the graph content in a human-readable format, with length set in
 // `len`. The format is subject to change in the future.
 // The returned string is heap-allocated, and caller should call free() on it.
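
For context, here is a minimal usage sketch of the new `TF_CreateConfig()` entry point from a C client. It is illustrative only, not code from this commit, and it assumes the standard C API helpers declared in `tensorflow/c/c_api.h` (`TF_NewSessionOptions`, `TF_SetConfig`, `TF_NewStatus`, and the matching delete functions).

#include <stdio.h>

#include "tensorflow/c/c_api.h"
#include "tensorflow/c/c_api_experimental.h"

int main(void) {
  TF_SessionOptions* opts = TF_NewSessionOptions();
  TF_Status* status = TF_NewStatus();

  // Build a serialized ConfigProto: XLA JIT enabled (global_jit_level = ON_1)
  // and GPU memory allow_growth enabled.
  TF_Buffer* config = TF_CreateConfig(/*enable_xla_compilation=*/1,
                                      /*gpu_memory_allow_growth=*/1);

  // Apply the serialized proto to the session options.
  TF_SetConfig(opts, config->data, config->length, status);
  if (TF_GetCode(status) != TF_OK) {
    fprintf(stderr, "TF_SetConfig failed: %s\n", TF_Message(status));
  }

  // ... create a graph and a TF_Session using `opts`, run it, etc. ...

  TF_DeleteBuffer(config);
  TF_DeleteStatus(status);
  TF_DeleteSessionOptions(opts);
  return 0;
}

Note that `TF_CreateConfig()` only builds the serialized proto; per the implementation above it returns a fresh `TF_Buffer`, so the caller owns it and releases it with `TF_DeleteBuffer()` after calling `TF_SetConfig()`.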

tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
Lines changed: 4 additions & 1 deletion

@@ -1136,7 +1136,10 @@ Status Encapsulator::Subgraph::AddShapeInferenceInfo(
         GraphToFunctionDef(*inference_graph, inference_graph_name, &fdef));
     host_compute->AddAttr("shape_inference_graph", inference_graph_name);
     host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
-    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+    // TODO(sibyl-Aix6ihai): Understand why there are multiple calls to Encapsulator.
+    if (library->Find(inference_graph_name) == nullptr) {
+      TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+    }
   }
   return Status::OK();
 }

tensorflow/compiler/jit/kernels/xla_launch_op.cc
Lines changed: 3 additions & 2 deletions

@@ -117,6 +117,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
   const XlaDevice::Metadata* metadata = nullptr;
   Status s = XlaDevice::GetMetadata(ctx, &metadata);
   bool allocate_xla_tensors = s.ok();
+  bool use_multiple_streams = s.ok() && metadata->UseMultipleStreams();
 
   // Get the platform_id_ for XLA_* devices.
   if (platform_id_ == nullptr) {
@@ -182,8 +183,8 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
 
   VLOG(1) << "Executing XLA Computation...";
 
-  XlaComputationLaunchContext launch_context(client, xla_allocator,
-                                             allocate_xla_tensors);
+  XlaComputationLaunchContext launch_context(
+      client, xla_allocator, allocate_xla_tensors, use_multiple_streams);
   launch_context.PopulateInputs(ctx, kernel, variables);
 
   // Execute the computation.

tensorflow/compiler/jit/xla_compilation_cache.cc
Lines changed: 1 addition & 17 deletions

@@ -40,23 +40,7 @@ namespace tensorflow {
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
-XlaCompilationCache::~XlaCompilationCache() {
-  // Ensure any use of our programs have completed by waiting for all stream
-  // executors to complete.
-  for (auto* executor : client_->backend().stream_executors()) {
-    bool ok = executor->SynchronizeAllActivity();
-    if (!ok) {
-      LOG(ERROR) << "Error synchronizing activity while waiting for all "
-                    "programs to complete";
-    }
-  }
-  // TODO(b/110813685): Think about the program ownership model. Programs are
-  // currently owned by the compilation cache which means we must wait for
-  // program completion in the destructor. There are multiple compilation caches
-  // around, which complicates things a little. Perhaps having programs be
-  // shared_ptrs (an invasive change) would make the model easier to reason
-  // about?
-}
+XlaCompilationCache::~XlaCompilationCache() = default;
 
 string XlaCompilationCache::DebugString() {
   return "XLA JIT compilation cache";

tensorflow/compiler/jit/xla_compile_on_demand_op.cc
Lines changed: 3 additions & 1 deletion

@@ -53,7 +53,9 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
 
   // Builds an XLA allocator for the device.
   XlaComputationLaunchContext launch_context(
-      client, client->backend().memory_allocator(), true);
+      client, client->backend().memory_allocator(),
+      /*allocate_xla_tensors=*/true,
+      /*use_multiple_streams=*/metadata.UseMultipleStreams());
 
   launch_context.PopulateInputs(ctx, result, variables);
 
tensorflow/compiler/jit/xla_cpu_device.cc
Lines changed: 1 addition & 0 deletions

@@ -54,6 +54,7 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options,
                                        DEVICE_CPU_XLA_JIT, options, name_prefix,
                                        registration,
                                        /*transfer_as_literal=*/false,
+                                       /*use_multiple_streams=*/false,
                                        /*shape_representation_fn=*/{},
                                        /*padded_shape_fn=*/{}, &device));
   devices->push_back(device.release());
