From d7ad9a5b346a25bc271b0818f4679990e49a849f Mon Sep 17 00:00:00 2001
From: Edward Oakes <ed.nmi.oakes@gmail.com>
Date: Thu, 2 Jan 2025 11:11:26 -0600
Subject: [PATCH] [serve] Replace `uuid.uuid4()` with `getrandbits` (#49537)

<!-- Thank you for your contribution! Please review
https://github.com/ray-project/ray/blob/master/CONTRIBUTING.rst before
opening a pull request. -->

<!-- Please add a reviewer to the assignee section when you create a PR.
If you don't have access to it, we will shortly find a reviewer and
assign them to your PR. -->

## Why are these changes needed?

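Generating request IDs with `uuid.uuid4()` draws from the OS entropy
source (`os.urandom`) on every call. Building the UUID from
`random.getrandbits(128)` instead reduces CPU overhead (particularly on
the proxy). The resulting IDs are not cryptographically secure, but that
should be OK for our use case: they only need to be unique identifiers,
not unguessable secrets.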

App:
```python
from ray import serve

@serve.deployment(
    max_ongoing_requests=100,
    num_replicas=16,
    ray_actor_options={"num_cpus": 0},
)
class A:
    def __call__(self):
        return b"hi"

app = A.bind()
```

Benchmark:
```
ab -n 10000 -c 100 http://127.0.0.1:8000/
```

Before (~780 qps):
```
Concurrency Level:      100
Time taken for tests:   12.747 seconds
Complete requests:      10000
Failed requests:        0
Total transferred:      1910000 bytes
HTML transferred:       120000 bytes
Requests per second:    784.47 [#/sec] (mean)
Time per request:       127.475 [ms] (mean)
Time per request:       1.275 [ms] (mean, across all concurrent requests)
Transfer rate:          146.32 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:        0    0   0.6      0      21
Processing:     5  127  35.7    127     305
Waiting:        3  125  35.8    126     304
Total:          5  127  35.6    128     306

Percentage of the requests served within a certain time (ms)
  50%    128
  66%    138
  75%    147
  80%    153
  90%    170
  95%    188
  98%    210
  99%    224
 100%    306 (longest request)
```

After (~820 qps):
```
Concurrency Level:      100
Time taken for tests:   12.130 seconds
Complete requests:      10000
Failed requests:        0
Total transferred:      1910000 bytes
HTML transferred:       120000 bytes
Requests per second:    824.44 [#/sec] (mean)
Time per request:       121.295 [ms] (mean)
Time per request:       1.213 [ms] (mean, across all concurrent requests)
Transfer rate:          153.78 [Kbytes/sec] received

Connection Times (ms)
              min  mean[+/-sd] median   max
Connect:        0    0   0.5      0       4
Processing:     6  121  30.1    124     230
Waiting:        4  119  30.2    123     228
Total:          7  121  30.0    124     230

Percentage of the requests served within a certain time (ms)
  50%    124
  66%    132
  75%    138
  80%    144
  90%    157
  95%    167
  98%    181
  99%    189
 100%    230 (longest request)
```

## Related issue number

<!-- For example: "Closes #1234" -->

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git
commit -s`) in this PR.
- [ ] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for
https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I
added a
method in Tune, I've added it in `doc/source/tune/api/` under the
           corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a
few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [ ] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

---------

Signed-off-by: Edward Oakes <ed.nmi.oakes@gmail.com>
---
 python/ray/serve/_private/router.py                    |  5 ++---
 python/ray/serve/_private/utils.py                     |  6 +++++-
 python/ray/serve/tests/test_http_headers.py            |  5 +++--
 .../serve/tests/unit/test_pow_2_replica_scheduler.py   | 10 +++++-----
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py
index 85d391c95d52..38d08c8591c3 100644
--- a/python/ray/serve/_private/router.py
+++ b/python/ray/serve/_private/router.py
@@ -3,7 +3,6 @@
 import logging
 import threading
 import time
-import uuid
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from contextlib import contextmanager
@@ -31,7 +30,7 @@
 from ray.serve._private.metrics_utils import InMemoryMetricsStore, MetricsPusher
 from ray.serve._private.replica_result import ReplicaResult
 from ray.serve._private.replica_scheduler import PendingRequest, ReplicaScheduler
-from ray.serve._private.utils import resolve_deployment_response
+from ray.serve._private.utils import generate_request_id, resolve_deployment_response
 from ray.serve.config import AutoscalingConfig
 from ray.serve.exceptions import BackPressureError
 from ray.util import metrics
@@ -564,7 +563,7 @@ async def assign_request(
     ) -> ReplicaResult:
         """Assign a request to a replica and return the resulting object_ref."""
 
-        response_id = uuid.uuid4()
+        response_id = generate_request_id()
         assign_request_task = asyncio.current_task()
         ray.serve.context._add_request_pending_assignment(
             request_meta.internal_request_id, response_id, assign_request_task
diff --git a/python/ray/serve/_private/utils.py b/python/ray/serve/_private/utils.py
index 1193f7722b63..c742fe4aad5c 100644
--- a/python/ray/serve/_private/utils.py
+++ b/python/ray/serve/_private/utils.py
@@ -542,7 +542,11 @@ def get_capacity_adjusted_num_replicas(
 
 
 def generate_request_id() -> str:
-    return str(uuid.uuid4())
+    # NOTE(edoakes): we use random.getrandbits because it reduces CPU overhead
+    # significantly. This is less cryptographically secure but should be ok for
+    # request ID generation.
+    # See https://bugs.python.org/issue45556 for discussion.
+    return str(uuid.UUID(int=random.getrandbits(128), version=4))
 
 
 def inside_ray_client_context() -> bool:
diff --git a/python/ray/serve/tests/test_http_headers.py b/python/ray/serve/tests/test_http_headers.py
index 22ddb9c530fc..b85e9816264d 100644
--- a/python/ray/serve/tests/test_http_headers.py
+++ b/python/ray/serve/tests/test_http_headers.py
@@ -11,6 +11,7 @@
 
 import ray
 from ray import serve
+from ray.serve._private.utils import generate_request_id
 
 
 def test_request_id_header_by_default(serve_instance):
@@ -154,10 +155,10 @@ async def main():
         """Sending 20 requests in parallel all with the same request id, but with
         different request body.
         """
-        bodies = [{"app_name": f"an_{uuid.uuid4()}"} for _ in range(20)]
+        bodies = [{"app_name": f"an_{generate_request_id()}"} for _ in range(20)]
         connector = TCPConnector(ssl=False)
         async with aiohttp.ClientSession(connector=connector) as session:
-            request_id = f"rid_{uuid.uuid4()}"
+            request_id = f"rid_{generate_request_id()}"
             tasks = [
                 send_request(session, body, request_id=request_id) for body in bodies
             ]
diff --git a/python/ray/serve/tests/unit/test_pow_2_replica_scheduler.py b/python/ray/serve/tests/unit/test_pow_2_replica_scheduler.py
index ecb5efbdf6e6..348bb8377bb6 100644
--- a/python/ray/serve/tests/unit/test_pow_2_replica_scheduler.py
+++ b/python/ray/serve/tests/unit/test_pow_2_replica_scheduler.py
@@ -4,7 +4,6 @@
 import random
 import sys
 import time
-import uuid
 from typing import Optional, Set
 
 import pytest
@@ -29,6 +28,7 @@
 )
 from ray.serve._private.replica_scheduler.pow_2_scheduler import ReplicaQueueLengthCache
 from ray.serve._private.test_utils import MockTimer
+from ray.serve._private.utils import generate_request_id
 
 TIMER = MockTimer()
 
@@ -184,8 +184,8 @@ def fake_pending_request(
             args=list(),
             kwargs=dict(),
             metadata=RequestMetadata(
-                request_id=str(uuid.uuid4()),
-                internal_request_id=str(uuid.uuid4()),
+                request_id=generate_request_id(),
+                internal_request_id=generate_request_id(),
                 multiplexed_model_id=model_id,
             ),
             created_at=created_at,
@@ -195,8 +195,8 @@ def fake_pending_request(
             args=list(),
             kwargs=dict(),
             metadata=RequestMetadata(
-                request_id=str(uuid.uuid4()),
-                internal_request_id=str(uuid.uuid4()),
+                request_id=generate_request_id(),
+                internal_request_id=generate_request_id(),
                 multiplexed_model_id=model_id,
             ),
         )