 os.environ["VLLM_USE_V1"] = "1"

 # Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-] * 10
+prompts = ["The president of the United States is"] * 41
 # Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+sampling_params = SamplingParams(max_tokens=100, temperature=0.0)

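Note on the sampling change: with temperature=0.0, decoding is greedy and therefore deterministic, so all 41 copies of the repeated prompt should produce identical completions; the example doubles as a quick correctness check for the DBO path. A minimal sketch of such a check, assuming vLLM's standard RequestOutput shape (the helper name is hypothetical, not part of this diff):

    def check_deterministic(outputs):
        # Hypothetical helper, not part of this diff: under greedy decoding,
        # every copy of the repeated prompt should yield identical text.
        texts = [out.outputs[0].text for out in outputs]
        assert all(t == texts[0] for t in texts), "generations diverged under greedy decoding"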
|

 def main():
     # Create an LLM.
     llm = LLM(
-        model="deepseek-ai/DeepSeek-V2-Lite",
+        model="deepseek-ai/DeepSeek-V3-Lite-base-latest-w8a8-dynamic",
         hf_overrides={
             "architectures": ["DeepseekDBOForCausalLM"],
         },  # override the model arch to run the dbo model
         enforce_eager=True,
-        tensor_parallel_size=8,
-        max_num_seqs=16,
-        max_model_len=8192,
-        max_num_batched_tokens=32768,
-        block_size=128,
-        compilation_config=1,
-        gpu_memory_utilization=0.96)
+        tensor_parallel_size=2,
+        max_model_len=4096,
+        trust_remote_code=True,
+        additional_config={
+            "torchair_graph_config": {
+                "enabled": False
+            },
+            "ascend_scheduler_config": {
+                "enabled": True
+            },
+            "expert_tensor_parallel_size": 1
+        })
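The replacement keyword arguments are vllm-ascend extensions rather than core vLLM engine options. A commented restatement of the added additional_config, with the meaning of each key stated as an assumption drawn from the vllm-ascend documentation rather than from this diff:

    # Assumed semantics of the vllm-ascend keys (from its docs, not this diff):
    additional_config = {
        # Leave TorchAir graph capture off; consistent with enforce_eager=True.
        "torchair_graph_config": {"enabled": False},
        # Schedule with the Ascend scheduler instead of the default V1 scheduler.
        "ascend_scheduler_config": {"enabled": True},
        # No expert-level tensor parallelism for the MoE experts.
        "expert_tensor_parallel_size": 1,
    }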

     # Generate texts from the prompts. The output is a list of RequestOutput
     # objects that contain the prompt, generated text, and other information.
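The diff ends at the comment above. In vLLM's offline-inference examples, main() typically continues with a generate call and a print loop; a minimal sketch of that tail, assuming the stock example structure (not shown in this diff), indented as it would appear inside main():

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")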
|