From 56cfa4a475b4f18218c386023f370258039c3923 Mon Sep 17 00:00:00 2001 From: aresnow Date: Fri, 15 Sep 2023 14:13:02 +0800 Subject: [PATCH 1/7] comment test --- .../llm/ggml/tests/test_ctransformers.py | 118 +++++++++--------- 1 file changed, 60 insertions(+), 58 deletions(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index 3b65e1f6f9..a3d3f80d11 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -13,14 +13,16 @@ # limitations under the License. import random import string -from concurrent.futures import ThreadPoolExecutor import pytest -from .....client import Client, GenerateModelHandle +# from .....client import Client, GenerateModelHandle from ....llm import GgmlLLMSpecV1, LLMFamilyV1 from ..ctransformers import CtransformersModel +# from concurrent.futures import ThreadPoolExecutor + + mock_model_spec = GgmlLLMSpecV1( model_format="ggmlv3", model_size_in_billions=6, @@ -120,59 +122,59 @@ def test_ctransformer_init(model_spec, model_family): assert model._llm is None -@pytest.mark.asyncio -async def test_ctransformers_generate(setup): - endpoint, _ = setup - client = Client(endpoint) - assert len(client.list_models()) == 0 - - model_uid = client.launch_model( - model_name="gpt-2", - model_size_in_billions=1, - model_format="ggmlv3", - quantization="none", - ) - - assert len(client.list_models()) == 1 - - model = client.get_model(model_uid=model_uid) - assert isinstance(model, GenerateModelHandle) - - # Test concurrent generate is OK. - def _check(): - completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) - print(completion) - assert "id" in completion - assert "text" in completion["choices"][0] - assert len(completion["choices"][0]["text"]) > 0 - - results = [] - with ThreadPoolExecutor() as executor: - for _ in range(3): - r = executor.submit(_check) - results.append(r) - for r in results: - r.result() - - completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) - print(completion) - assert "id" in completion - assert "text" in completion["choices"][0] - assert len(completion["choices"][0]["text"]) > 0 - - assert completion["model"] == model_uid - - assert "finish_reason" in completion["choices"][0] - assert completion["choices"][0]["finish_reason"] == "length" - - assert "prompt_tokens" in completion["usage"] - assert completion["usage"]["prompt_tokens"] == 4 - - assert "completion_tokens" in completion["usage"] - assert completion["usage"]["completion_tokens"] == 5 - - assert "total_tokens" in completion["usage"] - assert completion["usage"]["total_tokens"] == 9 - - client.terminate_model(model_uid=model_uid) - assert len(client.list_models()) == 0 +# @pytest.mark.asyncio +# async def test_ctransformers_generate(setup): +# endpoint, _ = setup +# client = Client(endpoint) +# assert len(client.list_models()) == 0 +# +# model_uid = client.launch_model( +# model_name="gpt-2", +# model_size_in_billions=1, +# model_format="ggmlv3", +# quantization="none", +# ) +# +# assert len(client.list_models()) == 1 +# +# model = client.get_model(model_uid=model_uid) +# assert isinstance(model, GenerateModelHandle) +# +# # Test concurrent generate is OK. +# def _check(): +# completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) +# print(completion) +# assert "id" in completion +# assert "text" in completion["choices"][0] +# assert len(completion["choices"][0]["text"]) > 0 +# +# results = [] +# with ThreadPoolExecutor() as executor: +# for _ in range(3): +# r = executor.submit(_check) +# results.append(r) +# for r in results: +# r.result() +# +# completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) +# print(completion) +# assert "id" in completion +# assert "text" in completion["choices"][0] +# assert len(completion["choices"][0]["text"]) > 0 +# +# assert completion["model"] == model_uid +# +# assert "finish_reason" in completion["choices"][0] +# assert completion["choices"][0]["finish_reason"] == "length" +# +# assert "prompt_tokens" in completion["usage"] +# assert completion["usage"]["prompt_tokens"] == 4 +# +# assert "completion_tokens" in completion["usage"] +# assert completion["usage"]["completion_tokens"] == 5 +# +# assert "total_tokens" in completion["usage"] +# assert completion["usage"]["total_tokens"] == 9 +# +# client.terminate_model(model_uid=model_uid) +# assert len(client.list_models()) == 0 From 810a3ef9e4edefaff0428c053670c3971d0c5601 Mon Sep 17 00:00:00 2001 From: aresnow Date: Fri, 15 Sep 2023 14:54:30 +0800 Subject: [PATCH 2/7] Add tests in test_client --- .../llm/ggml/tests/test_ctransformers.py | 101 +++++++----------- xinference/tests/test_client.py | 21 ++++ 2 files changed, 62 insertions(+), 60 deletions(-) diff --git a/xinference/model/llm/ggml/tests/test_ctransformers.py b/xinference/model/llm/ggml/tests/test_ctransformers.py index a3d3f80d11..d939aad42b 100644 --- a/xinference/model/llm/ggml/tests/test_ctransformers.py +++ b/xinference/model/llm/ggml/tests/test_ctransformers.py @@ -16,13 +16,10 @@ import pytest -# from .....client import Client, GenerateModelHandle +from .....client import Client, GenerateModelHandle from ....llm import GgmlLLMSpecV1, LLMFamilyV1 from ..ctransformers import CtransformersModel -# from concurrent.futures import ThreadPoolExecutor - - mock_model_spec = GgmlLLMSpecV1( model_format="ggmlv3", model_size_in_billions=6, @@ -122,59 +119,43 @@ def test_ctransformer_init(model_spec, model_family): assert model._llm is None -# @pytest.mark.asyncio -# async def test_ctransformers_generate(setup): -# endpoint, _ = setup -# client = Client(endpoint) -# assert len(client.list_models()) == 0 -# -# model_uid = client.launch_model( -# model_name="gpt-2", -# model_size_in_billions=1, -# model_format="ggmlv3", -# quantization="none", -# ) -# -# assert len(client.list_models()) == 1 -# -# model = client.get_model(model_uid=model_uid) -# assert isinstance(model, GenerateModelHandle) -# -# # Test concurrent generate is OK. -# def _check(): -# completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) -# print(completion) -# assert "id" in completion -# assert "text" in completion["choices"][0] -# assert len(completion["choices"][0]["text"]) > 0 -# -# results = [] -# with ThreadPoolExecutor() as executor: -# for _ in range(3): -# r = executor.submit(_check) -# results.append(r) -# for r in results: -# r.result() -# -# completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) -# print(completion) -# assert "id" in completion -# assert "text" in completion["choices"][0] -# assert len(completion["choices"][0]["text"]) > 0 -# -# assert completion["model"] == model_uid -# -# assert "finish_reason" in completion["choices"][0] -# assert completion["choices"][0]["finish_reason"] == "length" -# -# assert "prompt_tokens" in completion["usage"] -# assert completion["usage"]["prompt_tokens"] == 4 -# -# assert "completion_tokens" in completion["usage"] -# assert completion["usage"]["completion_tokens"] == 5 -# -# assert "total_tokens" in completion["usage"] -# assert completion["usage"]["total_tokens"] == 9 -# -# client.terminate_model(model_uid=model_uid) -# assert len(client.list_models()) == 0 +@pytest.mark.asyncio +async def test_ctransformers_generate(setup): + endpoint, _ = setup + client = Client(endpoint) + assert len(client.list_models()) == 0 + + model_uid = client.launch_model( + model_name="gpt-2", + model_size_in_billions=1, + model_format="ggmlv3", + quantization="none", + ) + + assert len(client.list_models()) == 1 + + model = client.get_model(model_uid=model_uid) + assert isinstance(model, GenerateModelHandle) + + completion = model.generate("AI is going to", generate_config={"max_tokens": 5}) + print(completion) + assert "id" in completion + assert "text" in completion["choices"][0] + assert len(completion["choices"][0]["text"]) > 0 + + assert completion["model"] == model_uid + + assert "finish_reason" in completion["choices"][0] + assert completion["choices"][0]["finish_reason"] == "length" + + assert "prompt_tokens" in completion["usage"] + assert completion["usage"]["prompt_tokens"] == 4 + + assert "completion_tokens" in completion["usage"] + assert completion["usage"]["completion_tokens"] == 5 + + assert "total_tokens" in completion["usage"] + assert completion["usage"]["total_tokens"] == 9 + + client.terminate_model(model_uid=model_uid) + assert len(client.list_models()) == 0 diff --git a/xinference/tests/test_client.py b/xinference/tests/test_client.py index 5fe48e16b0..7b96d6b738 100644 --- a/xinference/tests/test_client.py +++ b/xinference/tests/test_client.py @@ -13,6 +13,7 @@ # limitations under the License. import os +from concurrent.futures import ThreadPoolExecutor import pytest @@ -234,6 +235,26 @@ def test_RESTful_client(setup): for chunk in streaming_response: assert "content" or "role" in chunk["choices"][0]["delta"] + # Test concurrent chat is OK. + def _check(stream=False): + completion = model.chat("AI is going to", generate_config={"stream": stream}) + if stream: + for chunk in completion: + assert "content" or "role" in chunk["choices"][0]["delta"] + else: + assert "id" in completion + assert "content" in completion["choices"][0]["message"] + assert len(completion["choices"][0]["message"]) > 0 + + for stream in [True, False]: + results = [] + with ThreadPoolExecutor() as executor: + for _ in range(3): + r = executor.submit(_check, stream=stream) + results.append(r) + for r in results: + r.result() + client.terminate_model(model_uid=model_uid) assert len(client.list_models()) == 0 From bc92278163e44e179f3d4aa3b7439e55ab05ec72 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 19 Sep 2023 14:47:30 +0800 Subject: [PATCH 3/7] Use tiny llama --- xinference/tests/test_client.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/xinference/tests/test_client.py b/xinference/tests/test_client.py index 7b96d6b738..29d07e42e0 100644 --- a/xinference/tests/test_client.py +++ b/xinference/tests/test_client.py @@ -235,16 +235,30 @@ def test_RESTful_client(setup): for chunk in streaming_response: assert "content" or "role" in chunk["choices"][0]["delta"] + client.terminate_model(model_uid=model_uid) + assert len(client.list_models()) == 0 + + model_uid = client.launch_model( + model_name="tiny-llama", + model_size_in_billions=1, + model_format="ggufv2", + quantization="Q2_K", + ) + assert len(client.list_models()) == 1 + # Test concurrent chat is OK. def _check(stream=False): - completion = model.chat("AI is going to", generate_config={"stream": stream}) + model = client.get_model(model_uid=model_uid) + completion = model.generate( + "AI is going to", generate_config={"stream": stream} + ) if stream: for chunk in completion: - assert "content" or "role" in chunk["choices"][0]["delta"] + assert "text" in chunk["choices"][0] + assert len(chunk["choices"][0]["text"]) > 0 else: - assert "id" in completion - assert "content" in completion["choices"][0]["message"] - assert len(completion["choices"][0]["message"]) > 0 + assert "text" in completion["choices"][0] + assert len(completion["choices"][0]["text"]) > 0 for stream in [True, False]: results = [] From 55406625f02cf880e4f2fad7f9340d2be12c5cbf Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 19 Sep 2023 15:32:28 +0800 Subject: [PATCH 4/7] max_tokens --- xinference/tests/test_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/tests/test_client.py b/xinference/tests/test_client.py index 29d07e42e0..a887590765 100644 --- a/xinference/tests/test_client.py +++ b/xinference/tests/test_client.py @@ -250,7 +250,7 @@ def test_RESTful_client(setup): def _check(stream=False): model = client.get_model(model_uid=model_uid) completion = model.generate( - "AI is going to", generate_config={"stream": stream} + "AI is going to", generate_config={"stream": stream, "max_tokens": 5} ) if stream: for chunk in completion: From 63bdaea108b9f96837c81739ce9a8bc44b294692 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 19 Sep 2023 17:56:27 +0800 Subject: [PATCH 5/7] Skip tests for windows --- xinference/tests/test_client.py | 63 +++++++++++++++++---------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/xinference/tests/test_client.py b/xinference/tests/test_client.py index a887590765..af23ada020 100644 --- a/xinference/tests/test_client.py +++ b/xinference/tests/test_client.py @@ -238,39 +238,40 @@ def test_RESTful_client(setup): client.terminate_model(model_uid=model_uid) assert len(client.list_models()) == 0 - model_uid = client.launch_model( - model_name="tiny-llama", - model_size_in_billions=1, - model_format="ggufv2", - quantization="Q2_K", - ) - assert len(client.list_models()) == 1 - - # Test concurrent chat is OK. - def _check(stream=False): - model = client.get_model(model_uid=model_uid) - completion = model.generate( - "AI is going to", generate_config={"stream": stream, "max_tokens": 5} + if os.name != "nt": + model_uid = client.launch_model( + model_name="tiny-llama", + model_size_in_billions=1, + model_format="ggufv2", + quantization="Q2_K", ) - if stream: - for chunk in completion: - assert "text" in chunk["choices"][0] - assert len(chunk["choices"][0]["text"]) > 0 - else: - assert "text" in completion["choices"][0] - assert len(completion["choices"][0]["text"]) > 0 - - for stream in [True, False]: - results = [] - with ThreadPoolExecutor() as executor: - for _ in range(3): - r = executor.submit(_check, stream=stream) - results.append(r) - for r in results: - r.result() + assert len(client.list_models()) == 1 + + # Test concurrent chat is OK. + def _check(stream=False): + model = client.get_model(model_uid=model_uid) + completion = model.generate( + "AI is going to", generate_config={"stream": stream, "max_tokens": 5} + ) + if stream: + for chunk in completion: + assert "text" in chunk["choices"][0] + assert len(chunk["choices"][0]["text"]) > 0 + else: + assert "text" in completion["choices"][0] + assert len(completion["choices"][0]["text"]) > 0 + + for stream in [True, False]: + results = [] + with ThreadPoolExecutor() as executor: + for _ in range(3): + r = executor.submit(_check, stream=stream) + results.append(r) + for r in results: + r.result() - client.terminate_model(model_uid=model_uid) - assert len(client.list_models()) == 0 + client.terminate_model(model_uid=model_uid) + assert len(client.list_models()) == 0 with pytest.raises(RuntimeError): client.terminate_model(model_uid=model_uid) From 550da0085da12aa93f62fcb3c75256e2ea625ad6 Mon Sep 17 00:00:00 2001 From: aresnow Date: Tue, 19 Sep 2023 18:47:47 +0800 Subject: [PATCH 6/7] Skip tests for windows --- xinference/tests/test_client.py | 64 ++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/xinference/tests/test_client.py b/xinference/tests/test_client.py index af23ada020..d0cfffc7b4 100644 --- a/xinference/tests/test_client.py +++ b/xinference/tests/test_client.py @@ -186,6 +186,7 @@ def test_client_custom_model(setup): assert custom_model_reg is None +@pytest.mark.skipif(os.name == "nt", reason="Skip windows") def test_RESTful_client(setup): endpoint, _ = setup client = RESTfulClient(endpoint) @@ -238,40 +239,39 @@ def test_RESTful_client(setup): client.terminate_model(model_uid=model_uid) assert len(client.list_models()) == 0 - if os.name != "nt": - model_uid = client.launch_model( - model_name="tiny-llama", - model_size_in_billions=1, - model_format="ggufv2", - quantization="Q2_K", + model_uid = client.launch_model( + model_name="tiny-llama", + model_size_in_billions=1, + model_format="ggufv2", + quantization="Q2_K", + ) + assert len(client.list_models()) == 1 + + # Test concurrent chat is OK. + def _check(stream=False): + model = client.get_model(model_uid=model_uid) + completion = model.generate( + "AI is going to", generate_config={"stream": stream, "max_tokens": 5} ) - assert len(client.list_models()) == 1 - - # Test concurrent chat is OK. - def _check(stream=False): - model = client.get_model(model_uid=model_uid) - completion = model.generate( - "AI is going to", generate_config={"stream": stream, "max_tokens": 5} - ) - if stream: - for chunk in completion: - assert "text" in chunk["choices"][0] - assert len(chunk["choices"][0]["text"]) > 0 - else: - assert "text" in completion["choices"][0] - assert len(completion["choices"][0]["text"]) > 0 - - for stream in [True, False]: - results = [] - with ThreadPoolExecutor() as executor: - for _ in range(3): - r = executor.submit(_check, stream=stream) - results.append(r) - for r in results: - r.result() + if stream: + for chunk in completion: + assert "text" in chunk["choices"][0] + assert len(chunk["choices"][0]["text"]) > 0 + else: + assert "text" in completion["choices"][0] + assert len(completion["choices"][0]["text"]) > 0 + + for stream in [True, False]: + results = [] + with ThreadPoolExecutor() as executor: + for _ in range(3): + r = executor.submit(_check, stream=stream) + results.append(r) + for r in results: + r.result() - client.terminate_model(model_uid=model_uid) - assert len(client.list_models()) == 0 + client.terminate_model(model_uid=model_uid) + assert len(client.list_models()) == 0 with pytest.raises(RuntimeError): client.terminate_model(model_uid=model_uid) From 71d9c040c4709742265063c10ddfd1dab6119e1c Mon Sep 17 00:00:00 2001 From: aresnow Date: Wed, 20 Sep 2023 11:51:40 +0800 Subject: [PATCH 7/7] Skip other tests --- xinference/tests/test_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xinference/tests/test_client.py b/xinference/tests/test_client.py index d0cfffc7b4..20d839fe91 100644 --- a/xinference/tests/test_client.py +++ b/xinference/tests/test_client.py @@ -27,6 +27,7 @@ ) +@pytest.mark.skipif(os.name == "nt", reason="Skip windows") def test_client(setup): endpoint, _ = setup client = Client(endpoint) @@ -92,6 +93,7 @@ def test_client_for_embedding(setup): assert len(client.list_models()) == 0 +@pytest.mark.skipif(os.name == "nt", reason="Skip windows") def test_replica_model(setup): endpoint, _ = setup client = Client(endpoint)