|
13 | 13 | # - Server: |
14 | 14 | # |
15 | 15 | # ```bash |
| 16 | +# # Mistral format |
16 | 17 | # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ |
17 | 18 | # --tokenizer-mode mistral --config-format mistral --load-format mistral \ |
18 | 19 | # --limit-mm-per-prompt 'image=4' --max-model-len 16384 |
| 20 | +# |
| 21 | +# # HF format |
| 22 | +# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ |
| 23 | +# --limit-mm-per-prompt 'image=4' --max-model-len 16384 |
19 | 24 | # ``` |
20 | 25 | # |
21 | 26 | # - Client: |
|
44 | 49 | # python demo.py simple |
45 | 50 | # python demo.py advanced |
46 | 51 |
|
| 52 | +# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. |
| 53 | +# These scripts have been tested on 2x L40 GPUs (hence tensor_parallel_size=2).
| 54 | + |
47 | 55 |
|
48 | 56 | def run_simple_demo(args: argparse.Namespace): |
49 | 57 | model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" |
50 | 58 | sampling_params = SamplingParams(max_tokens=8192) |
51 | 59 |
|
52 | | - # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs. |
53 | 60 | llm = LLM( |
54 | 61 | model=model_name, |
55 | | - tokenizer_mode="mistral", |
56 | | - config_format="mistral", |
57 | | - load_format="mistral", |
| 62 | + tokenizer_mode="mistral" if args.format == "mistral" else "auto", |
| 63 | + config_format="mistral" if args.format == "mistral" else "auto", |
| 64 | + load_format="mistral" if args.format == "mistral" else "auto", |
58 | 65 | max_model_len=4096, |
59 | 66 | max_num_seqs=2, |
| 67 | + tensor_parallel_size=2, |
60 | 68 | disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, |
61 | 69 | ) |
62 | 70 |
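For a single lower-VRAM GPU, the knobs called out in the header note (max_model_len, max_num_seqs, and the newly added tensor_parallel_size) can be dialed down. A minimal sketch of such a variant of the constructor above; the concrete numbers are illustrative assumptions, not the tested 2x L40 configuration:

```python
from vllm import LLM

# Hypothetical single-GPU variant of the LLM constructor shown in this hunk.
# The values below are placeholders chosen to reduce memory use.
llm = LLM(
    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    max_model_len=2048,      # shorter context to fit in limited VRAM
    max_num_seqs=1,          # fewer concurrent sequences
    tensor_parallel_size=1,  # single GPU instead of two
)
```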
|
@@ -88,17 +96,18 @@ def run_simple_demo(args: argparse.Namespace): |
88 | 96 |
|
89 | 97 | def run_advanced_demo(args: argparse.Namespace): |
90 | 98 | model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" |
91 | | - max_img_per_msg = 5 |
| 99 | + max_img_per_msg = 3 |
92 | 100 | max_tokens_per_img = 4096 |
93 | 101 |
|
94 | 102 | sampling_params = SamplingParams(max_tokens=8192, temperature=0.7) |
95 | 103 | llm = LLM( |
96 | 104 | model=model_name, |
97 | | - tokenizer_mode="mistral", |
98 | | - config_format="mistral", |
99 | | - load_format="mistral", |
| 105 | + tokenizer_mode="mistral" if args.format == "mistral" else "auto", |
| 106 | + config_format="mistral" if args.format == "mistral" else "auto", |
| 107 | + load_format="mistral" if args.format == "mistral" else "auto", |
100 | 108 | limit_mm_per_prompt={"image": max_img_per_msg}, |
101 | 109 | max_model_len=max_img_per_msg * max_tokens_per_img, |
| 110 | + tensor_parallel_size=2, |
102 | 111 | disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, |
103 | 112 | ) |
104 | 113 |
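The context length in the advanced demo is derived from the image budget set in this hunk; a quick sanity check of the arithmetic, using the values from the diff:

```python
# Each allowed image gets a budget of max_tokens_per_img, so the context
# length is sized as the per-image budget times the image cap.
max_img_per_msg = 3
max_tokens_per_img = 4096
max_model_len = max_img_per_msg * max_tokens_per_img
assert max_model_len == 12288
```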
|
@@ -166,6 +175,11 @@ def main(): |
166 | 175 | help="Specify the demo mode: 'simple' or 'advanced'", |
167 | 176 | ) |
168 | 177 |
|
| 178 | + parser.add_argument('--format', |
| 179 | + choices=["mistral", "hf"], |
| 180 | + default="mistral", |
| 181 | + help='Checkpoint format to load: "mistral" or "hf".')
| 182 | + |
169 | 183 | parser.add_argument( |
170 | 184 | '--disable-mm-preprocessor-cache', |
171 | 185 | action='store_true', |
|
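With the new --format flag, the run commands from the header gain an optional checkpoint selector; a sketch of the expected invocations, mirroring the existing usage lines:

```bash
# Default: mistral (consolidated) checkpoint format
python demo.py simple

# Load the HF-format checkpoint instead
python demo.py simple --format hf
python demo.py advanced --format hf
```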