|
9 | 9 | ray = None |
10 | 10 |
|
11 | 11 | from cacheflow.master.scheduler import Scheduler |
| 12 | +from cacheflow.master.simple_frontend import SimpleFrontend |
12 | 13 | from cacheflow.models import get_memory_analyzer |
13 | 14 | from cacheflow.worker.controller import Controller, DeviceID |
14 | 15 | from cacheflow.sequence import SequenceGroup |
15 | 16 | from cacheflow.sampling_params import SamplingParams |
| 17 | +from cacheflow.utils import get_gpu_memory, get_cpu_memory |
16 | 18 |
|
17 | 19 |
|
18 | 20 | class Server: |
19 | 21 | def __init__( |
20 | 22 | self, |
21 | 23 | model: str, |
22 | | - model_path: str, |
| 24 | + cache_dir: Optional[str], |
23 | 25 | use_dummy_weights: bool, |
| 26 | + use_np_cache: bool, |
24 | 27 | pipeline_parallel_size: int, |
25 | 28 | tensor_parallel_size: int, |
26 | 29 | block_size: int, |
@@ -78,8 +81,9 @@ def __init__( |
78 | 81 | num_cpu_blocks=self.num_cpu_blocks, |
79 | 82 | dtype=dtype, |
80 | 83 | seed=seed, |
81 | | - model_path=model_path, |
| 84 | + cache_dir=cache_dir, |
82 | 85 | use_dummy_weights=use_dummy_weights, |
| 86 | + use_np_cache=use_np_cache, |
83 | 87 | max_num_batched_tokens=max_num_batched_tokens, |
84 | 88 | use_ray=use_ray, |
85 | 89 | ) |
@@ -203,25 +207,72 @@ def initialize_cluster( |
def add_server_arguments(parser: argparse.ArgumentParser):
    """Register every CacheFlow server CLI option on *parser* and return it.

    Options are grouped as: model, parallelism, KV cache, and scheduling
    knobs. The parser is modified in place and also returned for chaining.
    """
    add = parser.add_argument  # local alias to keep the long option list compact
    # Model arguments
    add('--model', type=str, default='facebook/opt-125m', help='model name')
    add('--cache-dir', type=str, default=None,
        help='cache dir to download and load the weights, '
             'default to the default cache dir of huggingface')
    add('--use-np-cache', action='store_true',
        help='save a numpy copy of model weights for faster loading')
    add('--use-dummy-weights', action='store_true', help='use dummy values for model weights')
    # NOTE(woosuk): If FlashAttention is used, the float data type is not supported.
    add('--dtype', type=str, default='half', choices=['half'], help='data type')
    # Parallel arguments
    add('--use-ray', action='store_true', help='use Ray for distributed serving, will be automatically set when using more than 1 GPU')
    add('--pipeline-parallel-size', '-pp', type=int, default=1, help='number of pipeline stages')
    add('--tensor-parallel-size', '-tp', type=int, default=1, help='number of tensor parallel replicas')
    # KV cache arguments
    add('--block-size', type=int, default=16, choices=[1, 2, 4, 8, 16, 32, 64, 128, 256], help='token block size')
    # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
    add('--seed', type=int, default=0, help='random seed')
    add('--swap-space', type=int, default=20, help='CPU swap space size (GiB) per GPU')
    add('--max-num-batched-tokens', type=int, default=2560, help='maximum number of batched tokens per iteration')
    add('--max-num-sequences', type=int, default=256, help='maximum number of sequences per iteration')
    return parser
223 | 230 |
|
| 231 | + |
def process_server_arguments(args: argparse.Namespace):
    """Post-process parsed server arguments in place and return them.

    Forces Ray-based serving whenever the configuration spans more than
    one device (pipeline stages x tensor-parallel replicas > 1).
    """
    world_size = args.pipeline_parallel_size * args.tensor_parallel_size
    if world_size > 1:
        args.use_ray = True
    return args
| 236 | + |
| 237 | + |
def init_local_server_and_frontend_with_arguments(args: argparse.Namespace):
    """Build a local ``Server`` and ``SimpleFrontend`` from parsed CLI args.

    Initializes the (single-node or Ray) cluster, then constructs the server
    and a matching frontend for the same model and block size.

    Returns:
        A ``(server, frontend)`` tuple.

    Raises:
        ValueError: if more than one pipeline stage is requested.
    """
    # TODO(zhuohan): Support pipeline parallelism.
    # NOTE: raise explicitly rather than `assert` — asserts are stripped
    # when Python runs with -O, which would silently skip this validation.
    if args.pipeline_parallel_size != 1:
        raise ValueError('Pipeline parallelism is not supported yet.')

    (num_nodes, num_devices_per_node, distributed_init_method,
     all_stage_devices) = (
         initialize_cluster(
             use_ray=args.use_ray,
             pipeline_parallel_size=args.pipeline_parallel_size,
             tensor_parallel_size=args.tensor_parallel_size))

    # Create a server.
    server = Server(
        model=args.model,
        cache_dir=args.cache_dir,
        use_dummy_weights=args.use_dummy_weights,
        use_np_cache=args.use_np_cache,
        pipeline_parallel_size=args.pipeline_parallel_size,
        tensor_parallel_size=args.tensor_parallel_size,
        block_size=args.block_size,
        dtype=args.dtype,
        seed=args.seed,
        swap_space=args.swap_space,
        max_num_batched_tokens=args.max_num_batched_tokens,
        max_num_sequences=args.max_num_sequences,
        num_nodes=num_nodes,
        num_devices_per_node=num_devices_per_node,
        distributed_init_method=distributed_init_method,
        all_stage_devices=all_stage_devices,
        gpu_memory=get_gpu_memory(),
        cpu_memory=get_cpu_memory(),
        use_ray=args.use_ray,
    )

    # Create a frontend.
    frontend = SimpleFrontend(
        model_name=args.model,
        block_size=args.block_size,
    )
    return server, frontend
0 commit comments