xorbitsai · qinxuye · Apr 29, 2024 · Apr 28, 2024 · Apr 28, 2024 · Apr 28, 2024
diff --git a/xinference/client/tests/test_client.py b/xinference/client/tests/test_client.py
@@ -620,7 +620,7 @@ def setup_cluster():
             logging_conf=TEST_FILE_LOGGING_CONF,
         )
         endpoint = f"http://localhost:{port}"
-        if not api_health_check(endpoint, max_attempts=3, sleep_interval=5):
+        if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
             raise RuntimeError("Endpoint is not available after multiple attempts")
 
         yield f"http://localhost:{port}", supervisor_address

diff --git a/xinference/conftest.py b/xinference/conftest.py
@@ -237,7 +237,7 @@ def setup_with_file_logging():
         logging_conf=TEST_FILE_LOGGING_CONF,
     )
     endpoint = f"http://localhost:{port}"
-    if not api_health_check(endpoint, max_attempts=3, sleep_interval=5):
+    if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
         raise RuntimeError("Endpoint is not available after multiple attempts")
 
     try:

diff --git a/xinference/core/tests/test_metrics.py b/xinference/core/tests/test_metrics.py
@@ -44,7 +44,7 @@ def setup_cluster():
             logging_conf=TEST_FILE_LOGGING_CONF,
         )
         endpoint = f"http://localhost:{port}"
-        if not api_health_check(endpoint, max_attempts=3, sleep_interval=5):
+        if not api_health_check(endpoint, max_attempts=10, sleep_interval=5):
             raise RuntimeError("Endpoint is not available after multiple attempts")
 
         yield f"http://localhost:{port}", f"http://localhost:{metrics_port}/metrics", supervisor_address

diff --git a/xinference/model/llm/__init__.py b/xinference/model/llm/__init__.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import codecs
-import gc
 import json
 import os
 
@@ -266,11 +265,3 @@ def _install():
     # register model description
     for ud_llm in get_user_defined_llm_families():
         LLM_MODEL_DESCRIPTIONS.update(generate_llm_description(ud_llm))
-
-    # Have to empty_cache here to reset CUDA status.
-    # Because `generate_engine_config_by_model_family` above has already initialized CUDA,
-    # which leads to torch initialization error in subprocess.
-    from ...device_utils import empty_cache
-
-    gc.collect()
-    empty_cache()
diff --git a/xinference/model/llm/core.py b/xinference/model/llm/core.py
@@ -82,9 +82,25 @@ def _is_linux():
 
     @staticmethod
     def _has_cuda_device():
-        from ...utils import cuda_count
-
-        return cuda_count() > 0
+        """
+        Report whether pynvml (NVML) finds at least one device; False on any NVML error.
+        DO NOT USE torch to implement this, which will lead to some unexpected errors.
+        """
+        from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlShutdown
+
+        device_count = 0
+        try:
+            nvmlInit()
+            device_count = nvmlDeviceGetCount()
+        except Exception:  # best effort: NVML init/query failed, report no device
+            pass
+        finally:
+            try:
+                nvmlShutdown()
+            except Exception:  # shutdown fails if init never succeeded; safe to ignore
+                pass
+
+        return device_count > 0
 
     @staticmethod
     def _get_cuda_count():