From dc646f73b4523a113e4433cc12ce60f727d0be1e Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:14:46 +0800
Subject: [PATCH 01/15] Add files via upload

---
 llm/yi/qwen2-7b.yaml | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 llm/yi/qwen2-7b.yaml

diff --git a/llm/yi/qwen2-7b.yaml b/llm/yi/qwen2-7b.yaml
new file mode 100644
index 00000000000..ccf8d62d306
--- /dev/null
+++ b/llm/yi/qwen2-7b.yaml
@@ -0,0 +1,33 @@
+envs:
+  MODEL_NAME: Qwen/Qwen2-7B-Instruct
+
+service:
+  # Specifying the path to the endpoint to check the readiness of the replicas.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_tokens: 1
+    initial_delay_seconds: 1200
+  # How many replicas to manage.
+  replicas: 2
+
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  disk_tier: best
+  ports: 8000
+
+setup: |
+  pip install vllm==0.6.1.post2
+  pip install vllm-flash-attn
+
+run: |
+  export PATH=$PATH:/sbin
+  vllm serve $MODEL_NAME \
+    --host 0.0.0.0 \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    --max-model-len 1024 | tee ~/openai_api_server.log

From 1a7fc1345bb63e1e029d1b9eddec2982d883ca58 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:18:30 +0800
Subject: [PATCH 02/15] Update and rename qwen2-7b.yaml to yi15-6b.yaml

---
 llm/yi/{qwen2-7b.yaml => yi15-6b.yaml} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename llm/yi/{qwen2-7b.yaml => yi15-6b.yaml} (89%)

diff --git a/llm/yi/qwen2-7b.yaml b/llm/yi/yi15-6b.yaml
similarity index 89%
rename from llm/yi/qwen2-7b.yaml
rename to llm/yi/yi15-6b.yaml
index ccf8d62d306..be3f380ff1b 100644
--- a/llm/yi/qwen2-7b.yaml
+++ b/llm/yi/yi15-6b.yaml
@@ -1,5 +1,5 @@
 envs:
-  MODEL_NAME: Qwen/Qwen2-7B-Instruct
+  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
 
 service:
   # Specifying the path to the endpoint to check the readiness of the replicas.
@@ -9,7 +9,7 @@ service:
       model: $MODEL_NAME
       messages:
         - role: user
-          content: Hello! What is your name?
+          content: Hi! What is your name?
       max_tokens: 1
     initial_delay_seconds: 1200
   # How many replicas to manage.

From 14feaf7b8ef3138af584d5c863cbd1cb55125f46 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:20:41 +0800
Subject: [PATCH 03/15] Add files via upload

---
 llm/yi/yi15-34b.yaml | 33 +++++++++++++++++++++++++++++++++
 llm/yi/yi15-9b.yaml  | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 llm/yi/yi15-34b.yaml
 create mode 100644 llm/yi/yi15-9b.yaml

diff --git a/llm/yi/yi15-34b.yaml b/llm/yi/yi15-34b.yaml
new file mode 100644
index 00000000000..be3f380ff1b
--- /dev/null
+++ b/llm/yi/yi15-34b.yaml
@@ -0,0 +1,33 @@
+envs:
+  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+
+service:
+  # Specifying the path to the endpoint to check the readiness of the replicas.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hi! What is your name?
+      max_tokens: 1
+    initial_delay_seconds: 1200
+  # How many replicas to manage.
+  replicas: 2
+
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  disk_tier: best
+  ports: 8000
+
+setup: |
+  pip install vllm==0.6.1.post2
+  pip install vllm-flash-attn
+
+run: |
+  export PATH=$PATH:/sbin
+  vllm serve $MODEL_NAME \
+    --host 0.0.0.0 \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    --max-model-len 1024 | tee ~/openai_api_server.log
diff --git a/llm/yi/yi15-9b.yaml b/llm/yi/yi15-9b.yaml
new file mode 100644
index 00000000000..be3f380ff1b
--- /dev/null
+++ b/llm/yi/yi15-9b.yaml
@@ -0,0 +1,33 @@
+envs:
+  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+
+service:
+  # Specifying the path to the endpoint to check the readiness of the replicas.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hi! What is your name?
+      max_tokens: 1
+    initial_delay_seconds: 1200
+  # How many replicas to manage.
+  replicas: 2
+
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  disk_tier: best
+  ports: 8000
+
+setup: |
+  pip install vllm==0.6.1.post2
+  pip install vllm-flash-attn
+
+run: |
+  export PATH=$PATH:/sbin
+  vllm serve $MODEL_NAME \
+    --host 0.0.0.0 \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    --max-model-len 1024 | tee ~/openai_api_server.log

From 4dc2a9182762b7f2cb8ebef75a963ab2b42e9381 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:22:37 +0800
Subject: [PATCH 04/15] Update yi15-9b.yaml

---
 llm/yi/yi15-9b.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm/yi/yi15-9b.yaml b/llm/yi/yi15-9b.yaml
index be3f380ff1b..cb8901b5dfc 100644
--- a/llm/yi/yi15-9b.yaml
+++ b/llm/yi/yi15-9b.yaml
@@ -1,5 +1,5 @@
 envs:
-  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+  MODEL_NAME: 01-ai/Yi-1.5-9B-Chat
 
 service:
   # Specifying the path to the endpoint to check the readiness of the replicas.
@@ -9,7 +9,7 @@ service:
       model: $MODEL_NAME
      messages:
         - role: user
-          content: Hi! What is your name?
+          content: Hello! What is your name?
       max_tokens: 1
     initial_delay_seconds: 1200
   # How many replicas to manage.
@@ -17,7 +17,7 @@ service:
 
 
 resources:
-  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
   disk_tier: best
   ports: 8000

From a1b68bcdca4e61e583518446ab32ac5aba77f078 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:24:51 +0800
Subject: [PATCH 05/15] Update yi15-34b.yaml

---
 llm/yi/yi15-34b.yaml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llm/yi/yi15-34b.yaml b/llm/yi/yi15-34b.yaml
index be3f380ff1b..e244cb0335d 100644
--- a/llm/yi/yi15-34b.yaml
+++ b/llm/yi/yi15-34b.yaml
@@ -1,5 +1,5 @@
 envs:
-  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+  MODEL_NAME: 01-ai/Yi-1.5-34B-Chat
 
 service:
   # Specifying the path to the endpoint to check the readiness of the replicas.
@@ -9,7 +9,7 @@ service:
       model: $MODEL_NAME
       messages:
         - role: user
-          content: Hi! What is your name?
+          content: Hello! What is your name?
       max_tokens: 1
     initial_delay_seconds: 1200
   # How many replicas to manage.
@@ -17,8 +17,10 @@ service:
 
 
 resources:
-  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  accelerators: {A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
+  disk_size: 1024
   disk_tier: best
+  memory: 32+
   ports: 8000

From 60cd1600991ebedf40f0b87840d5888bf5fc181f Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:25:10 +0800
Subject: [PATCH 06/15] Update yi15-6b.yaml

---
 llm/yi/yi15-6b.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/yi/yi15-6b.yaml b/llm/yi/yi15-6b.yaml
index be3f380ff1b..42ed8897035 100644
--- a/llm/yi/yi15-6b.yaml
+++ b/llm/yi/yi15-6b.yaml
@@ -9,7 +9,7 @@ service:
       model: $MODEL_NAME
       messages:
         - role: user
-          content: Hi! What is your name?
+          content: Hello! What is your name?
       max_tokens: 1
     initial_delay_seconds: 1200
   # How many replicas to manage.

From 9be9bd9e4cdb1db6e340d375e94c4323a7725d00 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:28:21 +0800
Subject: [PATCH 07/15] Add files via upload

---
 llm/yi/yicoder-1_5b.yaml | 33 +++++++++++++++++++++++++++++++++
 llm/yi/yicoder-9b.yaml   | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 llm/yi/yicoder-1_5b.yaml
 create mode 100644 llm/yi/yicoder-9b.yaml

diff --git a/llm/yi/yicoder-1_5b.yaml b/llm/yi/yicoder-1_5b.yaml
new file mode 100644
index 00000000000..42ed8897035
--- /dev/null
+++ b/llm/yi/yicoder-1_5b.yaml
@@ -0,0 +1,33 @@
+envs:
+  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+
+service:
+  # Specifying the path to the endpoint to check the readiness of the replicas.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_tokens: 1
+    initial_delay_seconds: 1200
+  # How many replicas to manage.
+  replicas: 2
+
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  disk_tier: best
+  ports: 8000
+
+setup: |
+  pip install vllm==0.6.1.post2
+  pip install vllm-flash-attn
+
+run: |
+  export PATH=$PATH:/sbin
+  vllm serve $MODEL_NAME \
+    --host 0.0.0.0 \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    --max-model-len 1024 | tee ~/openai_api_server.log
diff --git a/llm/yi/yicoder-9b.yaml b/llm/yi/yicoder-9b.yaml
new file mode 100644
index 00000000000..42ed8897035
--- /dev/null
+++ b/llm/yi/yicoder-9b.yaml
@@ -0,0 +1,33 @@
+envs:
+  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+
+service:
+  # Specifying the path to the endpoint to check the readiness of the replicas.
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_tokens: 1
+    initial_delay_seconds: 1200
+  # How many replicas to manage.
+  replicas: 2
+
+
+resources:
+  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  disk_tier: best
+  ports: 8000
+
+setup: |
+  pip install vllm==0.6.1.post2
+  pip install vllm-flash-attn
+
+run: |
+  export PATH=$PATH:/sbin
+  vllm serve $MODEL_NAME \
+    --host 0.0.0.0 \
+    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+    --max-model-len 1024 | tee ~/openai_api_server.log

From a9ffe54dc4ac85d6281b88d9e01d6f675bc42ccf Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:29:10 +0800
Subject: [PATCH 08/15] Update yicoder-1_5b.yaml

---
 llm/yi/yicoder-1_5b.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/yi/yicoder-1_5b.yaml b/llm/yi/yicoder-1_5b.yaml
index 42ed8897035..5c0d409483d 100644
--- a/llm/yi/yicoder-1_5b.yaml
+++ b/llm/yi/yicoder-1_5b.yaml
@@ -1,5 +1,5 @@
 envs:
-  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+  MODEL_NAME: 01-ai/Yi-Coder-1.5B-Chat
 
 service:
   # Specifying the path to the endpoint to check the readiness of the replicas.

From 6de0cf76fb67833ab369d2f31e77e5770dd9f727 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:30:23 +0800
Subject: [PATCH 09/15] Update yicoder-9b.yaml

---
 llm/yi/yicoder-9b.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/yi/yicoder-9b.yaml b/llm/yi/yicoder-9b.yaml
index 42ed8897035..36aaea45111 100644
--- a/llm/yi/yicoder-9b.yaml
+++ b/llm/yi/yicoder-9b.yaml
@@ -1,5 +1,5 @@
 envs:
-  MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
+  MODEL_NAME: 01-ai/Yi-Coder-9B-Chat
 
 service:
   # Specifying the path to the endpoint to check the readiness of the replicas.
@@ -17,7 +17,7 @@ service:
 
 
 resources:
-  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
+  accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
   disk_tier: best
   ports: 8000

From a53e27a45c3a581993592e1b2170da97af8d689c Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Thu, 19 Sep 2024 09:38:45 +0800
Subject: [PATCH 10/15] Add files via upload

---
 llm/yi/README.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 llm/yi/README.md

diff --git a/llm/yi/README.md b/llm/yi/README.md
new file mode 100644
index 00000000000..76fcf6151e6
--- /dev/null
+++ b/llm/yi/README.md
@@ -0,0 +1,60 @@
+# Serving Yi on Your Own Kubernetes or Cloud
+
+🤖 The Yi series models are the next generation of open-source large language models trained from scratch by [01.AI](https://www.lingyiwanwu.com/en).
+
+**Update (Sep 19, 2024) -** SkyPilot now supports the [**Yi**](https://01-ai.github.io/) models (Yi-Coder, Yi-1.5)!
+
+## Why use SkyPilot to deploy over commercial hosted solutions?
+
+* Get the best GPU availability by utilizing multiple resource pools across Kubernetes clusters and multiple regions/clouds.
+* Pay absolute minimum — SkyPilot picks the cheapest resources across Kubernetes clusters and regions/clouds. No managed solution markups.
+* Scale up to multiple replicas across different locations and accelerators, all served with a single endpoint.
+* Everything stays in your Kubernetes or cloud account (your VMs & buckets).
+* Completely private - no one else sees your chat history.
+
+
+## Running Yi models with SkyPilot
+
+After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Yi model on vLLM with SkyPilot in 1-click:
+
+1. Start serving Yi-1.5 34B on a single instance with any available GPU in the list specified in [yi15-34b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/yi/yi15-34b.yaml) with a vLLM-powered, OpenAI-compatible endpoint (you can also switch to [yicoder-9b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/yi/yicoder-9b.yaml) or [other models](https://github.com/skypilot-org/skypilot/tree/master/llm/yi) for a smaller model):
+
+```console
+sky launch -c yi yi15-34b.yaml
+```
+2. Send a request to the endpoint for completion:
+```bash
+ENDPOINT=$(sky status --endpoint 8000 yi)
+
+curl http://$ENDPOINT/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "01-ai/Yi-1.5-34B-Chat",
+    "prompt": "Who are you?",
+    "max_tokens": 512
+  }' | jq -r '.choices[0].text'
+```
+
+3. Send a request for chat completion:
+```bash
+curl http://$ENDPOINT/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "01-ai/Yi-1.5-34B-Chat",
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "Who are you?"
+      }
+    ],
+    "max_tokens": 512
+  }' | jq -r '.choices[0].message.content'
+```

From 022fa97762c64e4ab0c8892abf564efa7b3394ee Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:39:08 +0800
Subject: [PATCH 11/15] Update yi15-34b.yaml

---
 llm/yi/yi15-34b.yaml | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/llm/yi/yi15-34b.yaml b/llm/yi/yi15-34b.yaml
index e244cb0335d..99fe5481d7a 100644
--- a/llm/yi/yi15-34b.yaml
+++ b/llm/yi/yi15-34b.yaml
@@ -1,21 +1,6 @@
 envs:
   MODEL_NAME: 01-ai/Yi-1.5-34B-Chat
 
-
-service:
-  # Specifying the path to the endpoint to check the readiness of the replicas.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-      max_tokens: 1
-    initial_delay_seconds: 1200
-  # How many replicas to manage.
-  replicas: 2
-
 resources:
   accelerators: {A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
   disk_size: 1024
   disk_tier: best
   memory: 32+
   ports: 8000

From b746b2af42088ca1c95c74520ef67270dc96b132 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:39:26 +0800
Subject: [PATCH 12/15] Update yi15-6b.yaml

---
 llm/yi/yi15-6b.yaml | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/llm/yi/yi15-6b.yaml b/llm/yi/yi15-6b.yaml
index 42ed8897035..879f5ffea9c 100644
--- a/llm/yi/yi15-6b.yaml
+++ b/llm/yi/yi15-6b.yaml
@@ -1,21 +1,6 @@
 envs:
   MODEL_NAME: 01-ai/Yi-1.5-6B-Chat
 
-
-service:
-  # Specifying the path to the endpoint to check the readiness of the replicas.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-      max_tokens: 1
-    initial_delay_seconds: 1200
-  # How many replicas to manage.
-  replicas: 2
-
 resources:
   accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
   disk_tier: best
   ports: 8000

From f58ec47821913f2c1c8f5a4af9b0d4aca4fdd73c Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:39:58 +0800
Subject: [PATCH 13/15] Update yi15-9b.yaml

---
 llm/yi/yi15-9b.yaml | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/llm/yi/yi15-9b.yaml b/llm/yi/yi15-9b.yaml
index cb8901b5dfc..b7ac40b4e11 100644
--- a/llm/yi/yi15-9b.yaml
+++ b/llm/yi/yi15-9b.yaml
@@ -1,21 +1,6 @@
 envs:
   MODEL_NAME: 01-ai/Yi-1.5-9B-Chat
 
-
-service:
-  # Specifying the path to the endpoint to check the readiness of the replicas.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-      max_tokens: 1
-    initial_delay_seconds: 1200
-  # How many replicas to manage.
-  replicas: 2
-
 resources:
   accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
   disk_tier: best
   ports: 8000

From 7cd568111e0de0d11e8fa342d72a8f0fa50cf989 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:40:13 +0800
Subject: [PATCH 14/15] Update yicoder-1_5b.yaml

---
 llm/yi/yicoder-1_5b.yaml | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/llm/yi/yicoder-1_5b.yaml b/llm/yi/yicoder-1_5b.yaml
index 5c0d409483d..383f88b657d 100644
--- a/llm/yi/yicoder-1_5b.yaml
+++ b/llm/yi/yicoder-1_5b.yaml
@@ -1,21 +1,6 @@
 envs:
   MODEL_NAME: 01-ai/Yi-Coder-1.5B-Chat
 
-
-service:
-  # Specifying the path to the endpoint to check the readiness of the replicas.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-      max_tokens: 1
-    initial_delay_seconds: 1200
-  # How many replicas to manage.
-  replicas: 2
-
 resources:
   accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
   disk_tier: best
   ports: 8000

From 55cf8db7b0229ea13bb70e1e6975188af5103119 Mon Sep 17 00:00:00 2001
From: Haijian Wang <130898843+Haijian06@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:40:28 +0800
Subject: [PATCH 15/15] Update yicoder-9b.yaml

---
 llm/yi/yicoder-9b.yaml | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/llm/yi/yicoder-9b.yaml b/llm/yi/yicoder-9b.yaml
index 36aaea45111..28e74b45bb5 100644
--- a/llm/yi/yicoder-9b.yaml
+++ b/llm/yi/yicoder-9b.yaml
@@ -1,21 +1,6 @@
 envs:
   MODEL_NAME: 01-ai/Yi-Coder-9B-Chat
 
-
-service:
-  # Specifying the path to the endpoint to check the readiness of the replicas.
-  readiness_probe:
-    path: /v1/chat/completions
-    post_data:
-      model: $MODEL_NAME
-      messages:
-        - role: user
-          content: Hello! What is your name?
-      max_tokens: 1
-    initial_delay_seconds: 1200
-  # How many replicas to manage.
-  replicas: 2
-
 resources:
   accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
   disk_tier: best
   ports: 8000
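Since the YAMLs in this series serve an OpenAI-compatible API via vLLM, the endpoint can also be queried from code instead of the README's curl commands. Below is a minimal sketch using the official `openai` Python package; it assumes `pip install openai` has been run, the `yi` cluster from the README is up, and it fetches the endpoint by shelling out to the same `sky status --endpoint 8000 yi` command the README uses.

```python
# Minimal sketch: query the vLLM-served Yi endpoint through its
# OpenAI-compatible chat completions API.
import subprocess

from openai import OpenAI

# Fetch the endpoint the same way the README's curl examples do.
endpoint = subprocess.check_output(
    ["sky", "status", "--endpoint", "8000", "yi"], text=True
).strip()

# vLLM does not require an API key unless one was configured,
# so any placeholder value works here.
client = OpenAI(base_url=f"http://{endpoint}/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="01-ai/Yi-1.5-34B-Chat",  # must match MODEL_NAME in the launched YAML
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    max_tokens=512,
)
print(response.choices[0].message.content)
```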