From 6aec692b3532d1bfe4def9802970a1d98417f64b Mon Sep 17 00:00:00 2001
From: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com>
Date: Wed, 25 Jan 2023 10:39:18 -0800
Subject: [PATCH] [Serve] Document end-to-end timeout in Serve (#31769)

---
 doc/source/serve/performance.md            | 13 +++++++++++--
 python/ray/serve/_private/http_proxy.py    | 22 +++++++++++++++++-----
 python/ray/serve/tests/test_standalone2.py | 22 ++++++++++++++++++++--
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/doc/source/serve/performance.md b/doc/source/serve/performance.md
index b11107563e9c..62aed1f8b1cb 100644
--- a/doc/source/serve/performance.md
+++ b/doc/source/serve/performance.md
@@ -144,8 +144,10 @@ There are a handful of ways to address these issues:
    * Are you reserving GPUs for your deployment replicas using `ray_actor_options` (e.g. `ray_actor_options={"num_gpus": 1}`)?
    * Are you reserving one or more cores for your deployment replicas using `ray_actor_options` (e.g. `ray_actor_options={"num_cpus": 2}`)?
    * Are you setting [OMP_NUM_THREADS](serve-omp-num-threads) to increase the performance of your deep learning framework?
-2. Consider using `async` methods in your callable. See [the section below](serve-performance-async-methods).
-3. Consider batching your requests. See [the section below](serve-performance-batching-requests).
+2. Try batching your requests. See [the section above](serve-performance-batching-requests).
+3. Consider using `async` methods in your callable. See [the section below](serve-performance-async-methods).
+4. Set an end-to-end timeout for your HTTP requests. See [the section below](serve-performance-e2e-timeout).
+

 (serve-performance-async-methods)=
 ### Using `async` methods
@@ -159,3 +161,10 @@ hitting the same queuing issue mentioned above, you might want to increase
 `max_concurrent_queries`. Serve sets a low number (100) by default so the client gets
 proper backpressure. You can increase the value in the deployment decorator;
 e.g. `@serve.deployment(max_concurrent_queries=1000)`.
+
+(serve-performance-e2e-timeout)=
+### Set an end-to-end request timeout
+
+By default, Serve lets client HTTP requests run to completion no matter how long they take. However, slow requests can bottleneck replica processing and block other requests that are waiting. It's recommended that you set an end-to-end timeout, so slow requests can be terminated and retried at another replica.
+
+You can set an end-to-end timeout for HTTP requests by setting the `RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S` environment variable. HTTP proxies will wait for that many seconds before terminating an HTTP request and retrying it at another replica. This environment variable should be set on every node in your Ray cluster, and it cannot be updated at runtime.
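The documentation added above describes the timeout behavior without showing it in use. As a quick illustration (not part of this patch), here is a minimal sketch of how the timeout could be exercised; the `SlowModel` deployment, the sleep duration, and the assumption that the environment variable was exported on every node before starting Ray are all illustrative.

```python
# Illustrative only -- not part of this patch. Assumes the variable was set
# before starting Ray on every node, e.g.:
#   RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S=5 ray start --head
import time

import requests
from ray import serve


@serve.deployment
class SlowModel:
    def __call__(self, request) -> str:
        # Sleeping longer than the configured 5 second timeout should lead the
        # HTTP proxy to terminate this request and retry it on another replica
        # (see the http_proxy.py change below) instead of waiting forever.
        time.sleep(10)
        return "done"


serve.run(SlowModel.bind())

# Without the timeout, this call would block until the replica finished; with
# it, the proxy cancels and retries the request after 5 seconds.
print(requests.get("http://localhost:8000/").status_code)
```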
diff --git a/python/ray/serve/_private/http_proxy.py b/python/ray/serve/_private/http_proxy.py
index bd34707c4138..b00139d7be28 100644
--- a/python/ray/serve/_private/http_proxy.py
+++ b/python/ray/serve/_private/http_proxy.py
@@ -37,10 +37,22 @@
 SOCKET_REUSE_PORT_ENABLED = (
     os.environ.get("SERVE_SOCKET_REUSE_PORT_ENABLED", "1") == "1"
 )
-SERVE_REQUEST_PROCESSING_TIMEOUT_S = (
-    float(os.environ.get("SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0)) or None
+
+# TODO (shrekris-anyscale): Deprecate SERVE_REQUEST_PROCESSING_TIMEOUT_S env var
+RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S = (
+    float(os.environ.get("RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0))
+    or float(os.environ.get("SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0))
+    or None
 )
+if os.environ.get("SERVE_REQUEST_PROCESSING_TIMEOUT_S") is not None:
+    logger.warning(
+        "The `SERVE_REQUEST_PROCESSING_TIMEOUT_S` environment variable has "
+        "been deprecated. Please use `RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S` "
+        "instead. `SERVE_REQUEST_PROCESSING_TIMEOUT_S` will be ignored in "
+        "future versions."
+    )
+

 async def _send_request_to_handle(handle, scope, receive, send) -> str:
     http_body_bytes = await receive_http_body(scope, receive, send)
@@ -90,14 +102,14 @@ async def _send_request_to_handle(handle, scope, receive, send) -> str:
         # https://github.com/ray-project/ray/pull/29534 for more info.
         _, request_timed_out = await asyncio.wait(
-            [object_ref], timeout=SERVE_REQUEST_PROCESSING_TIMEOUT_S
+            [object_ref], timeout=RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S
         )
         if request_timed_out:
             logger.info(
                 "Request didn't finish within "
-                f"{SERVE_REQUEST_PROCESSING_TIMEOUT_S} seconds. Retrying "
+                f"{RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S} seconds. Retrying "
                 "with another replica. You can modify this timeout by "
-                'setting the "SERVE_REQUEST_PROCESSING_TIMEOUT_S" env var.'
+                'setting the "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S" env var.'
             )
             backoff = True
         else:
diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py
index cf726eccd620..35958d36670a 100644
--- a/python/ray/serve/tests/test_standalone2.py
+++ b/python/ray/serve/tests/test_standalone2.py
@@ -748,7 +748,16 @@ def f():

 class TestServeRequestProcessingTimeoutS:
     @pytest.mark.parametrize(
-        "ray_instance", [{"SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5"}], indirect=True
+        "ray_instance",
+        [
+            {"RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5"},
+            {"SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5"},
+            {
+                "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5",
+                "SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0",
+            },
+        ],
+        indirect=True,
     )
     def test_normal_operation(self, ray_instance):
         """Checks that a moderate timeout doesn't affect normal operation."""
@@ -765,7 +774,16 @@ def f(*args):
         serve.shutdown()

     @pytest.mark.parametrize(
-        "ray_instance", [{"SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0.1"}], indirect=True
+        "ray_instance",
+        [
+            {"RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0.1"},
+            {"SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0.1"},
+            {
+                "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0.1",
+                "SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0",
+            },
+        ],
+        indirect=True,
     )
     def test_hanging_request(self, ray_instance):
         """Checks that the env var mitigates the hang."""
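For readers skimming the tests above, the precedence the new code gives the two environment variables can be checked in isolation. The `resolve_request_timeout_s` helper below is not part of the patch; it simply mirrors the expression added to `http_proxy.py` so the three parametrized cases can be verified as plain Python.

```python
import os
from typing import Optional


def resolve_request_timeout_s(environ=None) -> Optional[float]:
    """Mirrors the precedence this patch adds to http_proxy.py (illustrative helper).

    The new RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S variable takes priority,
    the deprecated SERVE_REQUEST_PROCESSING_TIMEOUT_S is the fallback, and an
    unset or zero value means "no timeout" (None).
    """
    environ = os.environ if environ is None else environ
    return (
        float(environ.get("RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0))
        or float(environ.get("SERVE_REQUEST_PROCESSING_TIMEOUT_S", 0))
        or None
    )


# The same three cases the parametrized tests above exercise:
assert resolve_request_timeout_s({"RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5"}) == 5.0
assert resolve_request_timeout_s({"SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5"}) == 5.0
assert resolve_request_timeout_s(
    {
        "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S": "5",
        "SERVE_REQUEST_PROCESSING_TIMEOUT_S": "0",
    }
) == 5.0
```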