TST: Fix windows CI #455

Merged
merged 7 commits on Sep 20, 2023
17 changes: 0 additions & 17 deletions xinference/model/llm/ggml/tests/test_ctransformers.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 import random
 import string
-from concurrent.futures import ThreadPoolExecutor
 
 import pytest
 
@@ -138,22 +137,6 @@ async def test_ctransformers_generate(setup):
     model = client.get_model(model_uid=model_uid)
     assert isinstance(model, GenerateModelHandle)
 
-    # Test concurrent generate is OK.
-    def _check():
-        completion = model.generate("AI is going to", generate_config={"max_tokens": 5})
-        print(completion)
-        assert "id" in completion
-        assert "text" in completion["choices"][0]
-        assert len(completion["choices"][0]["text"]) > 0
-
-    results = []
-    with ThreadPoolExecutor() as executor:
-        for _ in range(3):
-            r = executor.submit(_check)
-            results.append(r)
-        for r in results:
-            r.result()
-
     completion = model.generate("AI is going to", generate_config={"max_tokens": 5})
     print(completion)
     assert "id" in completion
38 changes: 38 additions & 0 deletions xinference/tests/test_client.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+from concurrent.futures import ThreadPoolExecutor
 
 import pytest
 
@@ -26,6 +27,7 @@
 )
 
 
+@pytest.mark.skipif(os.name == "nt", reason="Skip windows")
 def test_client(setup):
     endpoint, _ = setup
     client = Client(endpoint)
@@ -91,6 +93,7 @@ def test_client_for_embedding(setup):
     assert len(client.list_models()) == 0
 
 
+@pytest.mark.skipif(os.name == "nt", reason="Skip windows")
 def test_replica_model(setup):
     endpoint, _ = setup
     client = Client(endpoint)
@@ -185,6 +188,7 @@ def test_client_custom_model(setup):
     assert custom_model_reg is None
 
 
+@pytest.mark.skipif(os.name == "nt", reason="Skip windows")
 def test_RESTful_client(setup):
     endpoint, _ = setup
     client = RESTfulClient(endpoint)
@@ -237,6 +241,40 @@ def test_RESTful_client(setup):
     client.terminate_model(model_uid=model_uid)
     assert len(client.list_models()) == 0
 
+    model_uid = client.launch_model(
+        model_name="tiny-llama",
+        model_size_in_billions=1,
+        model_format="ggufv2",
+        quantization="Q2_K",
+    )
+    assert len(client.list_models()) == 1
+
+    # Test concurrent generate is OK.
+    def _check(stream=False):
+        model = client.get_model(model_uid=model_uid)
+        completion = model.generate(
+            "AI is going to", generate_config={"stream": stream, "max_tokens": 5}
+        )
+        if stream:
+            for chunk in completion:
+                assert "text" in chunk["choices"][0]
+                assert len(chunk["choices"][0]["text"]) > 0
+        else:
+            assert "text" in completion["choices"][0]
+            assert len(completion["choices"][0]["text"]) > 0
+
+    for stream in [True, False]:
+        results = []
+        with ThreadPoolExecutor() as executor:
+            for _ in range(3):
+                r = executor.submit(_check, stream=stream)
+                results.append(r)
+            for r in results:
+                r.result()
+
+    client.terminate_model(model_uid=model_uid)
+    assert len(client.list_models()) == 0
+
     with pytest.raises(RuntimeError):
         client.terminate_model(model_uid=model_uid)
 
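For readers skimming the relocated concurrency check: the pattern works because Future.result() re-raises any exception (including AssertionError) thrown inside the worker thread, so a failed assertion in any submitted _check call fails the test in the main thread. Below is a minimal standalone sketch of that pattern, independent of xinference; the _check here is a hypothetical stand-in for the real model call.

    from concurrent.futures import ThreadPoolExecutor

    def _check(n: int) -> None:
        # Stand-in for the real check against a launched model.
        assert n % 2 == 0, f"unexpected odd value: {n}"

    results = []
    with ThreadPoolExecutor() as executor:
        for n in (0, 2, 4):
            results.append(executor.submit(_check, n))
        for r in results:
            r.result()  # re-raises any worker-thread exception here

All three submissions run concurrently; if any assertion failed, the corresponding r.result() call would raise in the caller and fail the test.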