From b2261a17647e28ceeb3a5ec7481d45c6b49cfc7c Mon Sep 17 00:00:00 2001 From: Lucain Date: Mon, 19 Aug 2024 17:13:22 +0200 Subject: [PATCH] Fix broken AsyncInferenceClient on [DONE] signal (#2458) * fix trailing newlines * fix cassettes * quality --- src/huggingface_hub/inference/_common.py | 6 +-- ...est_async_chat_completion_with_stream.yaml | 53 ++++++++----------- tests/test_inference_async_client.py | 8 ++- tests/test_inference_client.py | 10 ++-- 4 files changed, 34 insertions(+), 43 deletions(-) diff --git a/src/huggingface_hub/inference/_common.py b/src/huggingface_hub/inference/_common.py index 744fcfada1..3870fcddeb 100644 --- a/src/huggingface_hub/inference/_common.py +++ b/src/huggingface_hub/inference/_common.py @@ -296,7 +296,7 @@ def _format_text_generation_stream_output( if not byte_payload.startswith(b"data:"): return None # empty line - if byte_payload == b"data: [DONE]": + if byte_payload.strip() == b"data: [DONE]": raise StopIteration("[DONE] signal received.") # Decode payload @@ -344,7 +344,7 @@ def _format_chat_completion_stream_output( if not byte_payload.startswith(b"data:"): return None # empty line - if byte_payload == b"data: [DONE]": + if byte_payload.strip() == b"data: [DONE]": raise StopIteration("[DONE] signal received.") # Decode payload @@ -355,7 +355,7 @@ def _format_chat_completion_stream_output( async def _async_yield_from(client: "ClientSession", response: "ClientResponse") -> AsyncIterable[bytes]: async for byte_payload in response.content: - yield byte_payload + yield byte_payload.strip() await client.close() diff --git a/tests/cassettes/test_async_chat_completion_with_stream.yaml b/tests/cassettes/test_async_chat_completion_with_stream.yaml index 338fd4227a..18cdb9c024 100644 --- a/tests/cassettes/test_async_chat_completion_with_stream.yaml +++ b/tests/cassettes/test_async_chat_completion_with_stream.yaml @@ -3,80 +3,71 @@ interactions: body: null headers: user-agent: - - unknown/None; hf_hub/0.22.0.dev0; python/3.10.12; torch/2.2.0; tensorflow/2.15.0.post1; + - unknown/None; hf_hub/0.25.0.dev0; python/3.10.12; torch/2.3.1; tensorflow/2.17.0; fastcore/1.5.23 method: POST uri: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta/v1/chat/completions response: body: - string: 'data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"Deep"},"logprobs":null,"finish_reason":null}]} + string: 'data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"Deep"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" - Learning"},"logprobs":null,"finish_reason":null}]} + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + learning"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" is"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" a"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" sub"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"field"},"logprobs":null,"finish_reason":null}]} + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":"field"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" of"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" - Machine"},"logprobs":null,"finish_reason":null}]} + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + machine"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" - Learning"},"logprobs":null,"finish_reason":null}]} + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + learning"},"logprobs":null,"finish_reason":null}]} - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":" - that"},"logprobs":null,"finish_reason":null}]} + data:{"id":"","object":"text_completion","created":1724071633,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"2.0.4-sha-f426a33","choices":[{"index":0,"delta":{"role":"assistant","content":" + that"},"logprobs":null,"finish_reason":"length"}]} + - data:{"id":"","object":"text_completion","created":1710439508,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"length"}]} ' headers: Access-Control-Allow-Credentials: - 'true' - Access-Control-Allow-Origin: - - '*' - Cache-Control: - - no-cache Connection: - keep-alive + Content-Length: + - '2526' Content-Type: - text/event-stream Date: - - Thu, 14 Mar 2024 18:05:08 GMT - Transfer-Encoding: - - chunked + - Mon, 19 Aug 2024 13:01:29 GMT Vary: - - origin, Origin, Access-Control-Request-Method, Access-Control-Request-Headers - x-accel-buffering: - - 'no' - x-compute-characters: - - '103' + - Origin, Access-Control-Request-Method, Access-Control-Request-Headers x-compute-type: - - 1-a10-g + - cache x-request-id: - - idvh81inTm9FBUT-za5t7 + - w9oS4KSPCoEAOi6QV7-cX x-sha: - b70e0c9a2d9e14bd1e812d3c398e5f313e93b473 status: code: 200 message: OK - url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta/v1/chat/completions version: 1 diff --git a/tests/test_inference_async_client.py b/tests/test_inference_async_client.py index 0e85a6e281..0e5008fd29 100644 --- a/tests/test_inference_async_client.py +++ b/tests/test_inference_async_client.py @@ -212,18 +212,16 @@ async def test_async_chat_completion_with_stream() -> None: all_items = [item async for item in output] generated_text = "" - for item in all_items[:-1]: # all but last item + for item in all_items: assert isinstance(item, ChatCompletionStreamOutput) assert len(item.choices) == 1 generated_text += item.choices[0].delta.content last_item = all_items[-1] - assert generated_text == "Deep Learning is a subfield of Machine Learning that" + assert generated_text == "Deep learning is a subfield of machine learning that" - # Last item has a finish reason but no role/content delta + # Last item has a finish reason assert last_item.choices[0].finish_reason == "length" - assert last_item.choices[0].delta.role is None - assert last_item.choices[0].delta.content is None @pytest.mark.vcr diff --git a/tests/test_inference_client.py b/tests/test_inference_client.py index 804e3f3ea0..9b7f057ffc 100644 --- a/tests/test_inference_client.py +++ b/tests/test_inference_client.py @@ -974,13 +974,14 @@ def test_chat_completion_base_url_works_with_v1(base_url: str): assert post_mock.call_args_list[0].kwargs["model"] == "http://0.0.0.0:8080/v1/chat/completions" -def test_stream_text_generation_response(): +@pytest.mark.parametrize("stop_signal", [b"data: [DONE]", b"data: [DONE]\n", b"data: [DONE] "]) +def test_stream_text_generation_response(stop_signal: bytes): data = [ b'data: {"index":1,"token":{"id":4560,"text":" trying","logprob":-2.078125,"special":false},"generated_text":null,"details":null}', b"", # Empty line is skipped b"\n", # Newline is skipped b'data: {"index":2,"token":{"id":311,"text":" to","logprob":-0.026245117,"special":false},"generated_text":" trying to","details":null}', - b"data: [DONE]", # Stop signal + stop_signal, # Stop signal # Won't parse after b'data: {"index":2,"token":{"id":311,"text":" to","logprob":-0.026245117,"special":false},"generated_text":" trying to","details":null}', ] @@ -989,13 +990,14 @@ def test_stream_text_generation_response(): assert output == [" trying", " to"] -def test_stream_chat_completion_response(): +@pytest.mark.parametrize("stop_signal", [b"data: [DONE]", b"data: [DONE]\n", b"data: [DONE] "]) +def test_stream_chat_completion_response(stop_signal: bytes): data = [ b'data: {"object":"chat.completion.chunk","id":"","created":1721737661,"model":"","system_fingerprint":"2.1.2-dev0-sha-5fca30e","choices":[{"index":0,"delta":{"role":"assistant","content":"Both"},"logprobs":null,"finish_reason":null}]}', b"", # Empty line is skipped b"\n", # Newline is skipped b'data: {"object":"chat.completion.chunk","id":"","created":1721737661,"model":"","system_fingerprint":"2.1.2-dev0-sha-5fca30e","choices":[{"index":0,"delta":{"role":"assistant","content":" Rust"},"logprobs":null,"finish_reason":null}]}', - b"data: [DONE]", # Stop signal + stop_signal, # Stop signal # Won't parse after b'data: {"index":2,"token":{"id":311,"text":" to","logprob":-0.026245117,"special":false},"generated_text":" trying to","details":null}', ]