From 1801e05911d7c803b0d6f358d3f47f3ef8e95e16 Mon Sep 17 00:00:00 2001
From: Roger Meier <r.meier@siemens.com>
Date: Thu, 29 Feb 2024 15:07:52 +0100
Subject: [PATCH 1/3] feat: add a docker-compose-distributed example with
 multiple workers

---
 .../docker/docker-compose-distributed.yml     | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 xinference/deploy/docker/docker-compose-distributed.yml

diff --git a/xinference/deploy/docker/docker-compose-distributed.yml b/xinference/deploy/docker/docker-compose-distributed.yml
new file mode 100644
index 0000000000..bcbfbc5f7e
--- /dev/null
+++ b/xinference/deploy/docker/docker-compose-distributed.yml
@@ -0,0 +1,48 @@
+version: '3.8'
+
+services:
+  xinference: &xinference  # shared base settings, merged into the services below via `<<: *xinference`
+    image: xprobe/xinference:latest
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]  # reserve GPU devices for the container
+              driver: nvidia
+              count: all  # request all GPUs rather than a fixed number
+#    volumes:
+#      # Replace <xinference_home> with your xinference home path on the host machine
+#      - <xinference_home>:/root/.xinference
+#      # Replace <huggingface_cache_dir> with your huggingface cache path, default is
+#      # <home_path>/.cache/huggingface
+#      - <huggingface_cache_dir>:/root/.cache/huggingface
+#      # If models are downloaded from modelscope, replace <modelscope_cache_dir> with
+#      # your modelscope cache path, default is <home_path>/.cache/modelscope
+#      - <modelscope_cache_dir>:/root/.cache/modelscope
+#    environment:
+#      # Add environment variables here. For example, to download models from modelscope:
+#      - XINFERENCE_MODEL_SRC=modelscope
+
+  xinference-supervisor:  # supervisor service; the workers below target it at http://xinference-supervisor:9997
+    <<: *xinference  # merge in the shared image and GPU reservation
+    ports:  # publish both ports passed to the command below
+      - "9997:9997"  # value given to --port
+      - "9999:9999"
+    command: xinference-supervisor --host xinference-supervisor --port 9997 --supervisor-port 9999
+    restart: always
+
+  # This examples is just using two workers. You can add more by incrementing
+  # the worker suffix and port number.
+  xinference-worker-1:  # first worker; points at the supervisor through the -e endpoint below
+    <<: *xinference  # merge in the shared image and GPU reservation
+    ports:  # publish the port passed to --worker-port below
+      - "30001:30001"
+    command: xinference-worker -e http://xinference-supervisor:9997 --host xinference-worker-1 --worker-port 30001
+    restart: always
+
+  xinference-worker-2:
+    <<: *xinference
+    ports:  # publish the port passed to --worker-port below
+      - "30002:30002"
+    command: xinference-worker -e http://xinference-supervisor:9997 --host xinference-worker-2 --worker-port 30002
+    restart: always

From 4ccd3a0f495dd7671c224704cb5757d4859a6393 Mon Sep 17 00:00:00 2001
From: Roger Meier <r.meier@siemens.com>
Date: Fri, 1 Mar 2024 10:53:55 +0100
Subject: [PATCH 2/3] feat: add healthcheck to docker-compose-distributed

---
 .../deploy/docker/docker-compose-distributed.yml     | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/xinference/deploy/docker/docker-compose-distributed.yml b/xinference/deploy/docker/docker-compose-distributed.yml
index bcbfbc5f7e..2c15f02f63 100644
--- a/xinference/deploy/docker/docker-compose-distributed.yml
+++ b/xinference/deploy/docker/docker-compose-distributed.yml
@@ -30,6 +30,12 @@ services:
       - "9999:9999"
     command: xinference-supervisor --host xinference-supervisor --port 9997 --supervisor-port 9999
     restart: always
+    healthcheck:
+      test: curl --fail http://xinference-supervisor:9997/status || exit 1
+      interval: 60s
+      retries: 5
+      start_period: 20s
+      timeout: 5s
 
   # This examples is just using two workers. You can add more by incrementing
   # the worker suffix and port number.
@@ -39,6 +45,9 @@ services:
       - "30001:30001"
     command: xinference-worker -e http://xinference-supervisor:9997 --host xinference-worker-1 --worker-port 30001
     restart: always
+    depends_on:
+      xinference-supervisor:
+        condition: service_healthy
 
   xinference-worker-2:
     <<: *xinference
@@ -46,3 +55,6 @@ services:
       - "30002:30002"
     command: xinference-worker -e http://xinference-supervisor:9997 --host xinference-worker-2 --worker-port 30002
     restart: always
+    depends_on:
+      xinference-supervisor:
+        condition: service_healthy

From 20221c09c7e9863d6253804805142f066df12350 Mon Sep 17 00:00:00 2001
From: Roger Meier <r.meier@siemens.com>
Date: Wed, 6 Mar 2024 07:06:26 +0100
Subject: [PATCH 3/3] refactor: set interval/start_period to 5s within
 docker-compose-distributed

---
 xinference/deploy/docker/docker-compose-distributed.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xinference/deploy/docker/docker-compose-distributed.yml b/xinference/deploy/docker/docker-compose-distributed.yml
index 2c15f02f63..3d2a4c3d00 100644
--- a/xinference/deploy/docker/docker-compose-distributed.yml
+++ b/xinference/deploy/docker/docker-compose-distributed.yml
@@ -32,9 +32,9 @@ services:
     restart: always
     healthcheck:
       test: curl --fail http://xinference-supervisor:9997/status || exit 1
-      interval: 60s
+      interval: 5s
       retries: 5
-      start_period: 20s
+      start_period: 5s
       timeout: 5s
 
   # This examples is just using two workers. You can add more by incrementing