# Single Node (Atlas 300I series)

```{note}
-This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage, performance improvement.
+1. Support for the Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage and performance improvements.
+2. Currently, the 300I series only supports eager mode, and the data type must be float16.
```

## Run vLLM on Atlas 300I series
@@ -83,7 +84,7 @@ curl http://localhost:8000/v1/completions \

::::

-::::{tab-item} Qwen/Qwen2.5-7B-Instruct
+::::{tab-item} Qwen2.5-7B-Instruct
:sync: qwen7b

Run the following command to start the vLLM server:
@@ -113,6 +114,36 @@ curl http://localhost:8000/v1/completions \

::::

+::::{tab-item} Qwen2.5-VL-3B-Instruct
+:sync: qwen-vl-2.5-3b
+
+Run the following command to start the vLLM server:
+
+```{code-block} bash
+ :substitutions:
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+  --tensor-parallel-size 1 \
+  --enforce-eager \
+  --dtype float16 \
+  --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
+```
+
+Once your server is started, you can query the model with input prompts:
+
+```bash
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen2.5-VL-3B-Instruct",
+    "prompt": "The future of AI is",
+    "max_tokens": 64,
+    "top_p": 0.95,
+    "top_k": 50,
+    "temperature": 0.6
+  }'
+```
+
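+Since Qwen2.5-VL is a multimodal model, you can also exercise its vision input through the OpenAI-compatible chat completions endpoint. A minimal sketch; the image URL is a placeholder:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen2.5-VL-3B-Instruct",
+    "messages": [
+      {"role": "user", "content": [
+        {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
+        {"type": "text", "text": "Describe this image."}
+      ]}
+    ],
+    "max_tokens": 64
+  }'
+```
+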
+::::
+
::::{tab-item} Pangu-Pro-MoE-72B
:sync: pangu

@@ -251,6 +282,49 @@ clean_up()

::::

+::::{tab-item} Qwen2.5-VL-3B-Instruct
+:sync: qwen-vl-2.5-3b
+
+```{code-block} python
+ :substitutions:
+import gc
+
+import torch
+from vllm import LLM, SamplingParams
+from vllm.distributed.parallel_state import (destroy_distributed_environment,
+                                             destroy_model_parallel)
+
+
+def clean_up():
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    gc.collect()
+    torch.npu.empty_cache()
+
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(max_tokens=100, top_p=0.95, top_k=50, temperature=0.6)
+# Create an LLM.
+llm = LLM(
+    model="Qwen/Qwen2.5-VL-3B-Instruct",
+    tensor_parallel_size=1,
+    enforce_eager=True,  # For the 300I series, only eager mode is supported.
+    dtype="float16",  # IMPORTANT: some ATB ops do not support bf16 on the 300I series.
+    compilation_config={"custom_ops": ["none", "+rms_norm", "+rotary_embedding"]},  # High performance for the 300I series.
+)
+# Generate texts from the prompts.
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+del llm
+clean_up()
+```
+
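+The prompts above are text-only. To try the vision path offline, you can pass an image in an OpenAI-style message via `LLM.chat`, which applies the model's chat template before generating. A minimal sketch, assuming a reachable placeholder image URL and reusing the `llm` and `sampling_params` created above; run it before the `del llm` / `clean_up()` teardown:
+
+```python
+# Placeholder image URL; any image the process can fetch will do.
+messages = [{
+    "role": "user",
+    "content": [
+        {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
+        {"type": "text", "text": "Describe this image."},
+    ],
+}]
+# LLM.chat renders the chat template and runs generation in one call.
+outputs = llm.chat(messages, sampling_params)
+print(outputs[0].outputs[0].text)
+```
+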
+::::
+
::::{tab-item} Pangu-Pro-MoE-72B
:sync: pangu
