|
@@ -124,7 +124,7 @@
         max_model_len=8192,
     ),
     placement_group_config=dict(
-        bundles=[{"GPU": 1, "CPU": 2}] * 2,
+        bundles=[{"GPU": 1}] * 2,
         strategy="PACK",
     ),
 )
|
@@ -158,7 +158,7 @@
         max_model_len=8192,
     ),
     placement_group_config=dict(
-        bundles=[{"GPU": 1, "CPU": 2}] * 4,
+        bundles=[{"GPU": 1}] * 4,
         strategy="SPREAD",
     ),
 )
|
@@ -192,7 +192,7 @@
         max_model_len=8192,
     ),
     placement_group_config=dict(
-        bundles=[{"GPU": 1, "CPU": 2}] * 2,
+        bundles=[{"GPU": 1}] * 2,
         strategy="STRICT_PACK",
     ),
 )
|
@@ -201,61 +201,3 @@
 app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(app, blocking=True)
 # __custom_placement_group_strict_pack_example_end__
-
-# __yaml_cross_node_tp_pp_example_start__
-# config.yaml
-# applications:
-# - args:
-#     llm_configs:
-#       - model_loading_config:
-#           model_id: llama-3.1-8b
-#           model_source: meta-llama/Llama-3.1-8B-Instruct
-#         accelerator_type: L4
-#         deployment_config:
-#           autoscaling_config:
-#             min_replicas: 1
-#             max_replicas: 1
-#         engine_kwargs:
-#           tensor_parallel_size: 2
-#           pipeline_parallel_size: 2
-#           distributed_executor_backend: ray
-#           max_model_len: 8192
-#           enable_chunked_prefill: true
-#           max_num_batched_tokens: 4096
-#   import_path: ray.serve.llm:build_openai_app
-#   name: llm_app
-#   route_prefix: "/"
-# __yaml_cross_node_tp_pp_example_end__
-
-# __yaml_custom_placement_group_example_start__
-# config.yaml
-# applications:
-# - args:
-#     llm_configs:
-#       - model_loading_config:
-#           model_id: llama-3.1-8b
-#           model_source: meta-llama/Llama-3.1-8B-Instruct
-#         accelerator_type: L4
-#         deployment_config:
-#           autoscaling_config:
-#             min_replicas: 1
-#             max_replicas: 1
-#         engine_kwargs:
-#           tensor_parallel_size: 4
-#           distributed_executor_backend: ray
-#           max_model_len: 8192
-#         placement_group_config:
-#           bundles:
-#             - GPU: 1
-#               CPU: 2
-#             - GPU: 1
-#               CPU: 2
-#             - GPU: 1
-#               CPU: 2
-#             - GPU: 1
-#               CPU: 2
-#           strategy: SPREAD
-#   import_path: ray.serve.llm:build_openai_app
-#   name: llm_app
-#   route_prefix: "/"
-# __yaml_custom_placement_group_example_end__
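For reference, a minimal end-to-end sketch of how the placement_group_config fragments above fit into a complete deployment. The model IDs, accelerator type, and engine_kwargs are taken from the examples in this diff and are illustrative; only the GPU-only bundles and the PACK strategy reflect the change itself.

# Sketch assuming the ray.serve.llm API used elsewhere in this file.
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="llama-3.1-8b",
        model_source="meta-llama/Llama-3.1-8B-Instruct",
    ),
    accelerator_type="L4",
    deployment_config=dict(
        autoscaling_config=dict(min_replicas=1, max_replicas=1),
    ),
    engine_kwargs=dict(
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        max_model_len=8192,
    ),
    # One GPU-only bundle per tensor-parallel worker, matching the change
    # above; PACK co-locates bundles on one node when capacity allows.
    placement_group_config=dict(
        bundles=[{"GPU": 1}] * 2,
        strategy="PACK",
    ),
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)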