|
30 | 30 | "stage_ids_this_rank", |
31 | 31 | "generate_llm_fqn_per_model_part", |
32 | 32 | "pipeline_module_split", |
| 33 | + "module_split", |
33 | 34 | ] |
34 | 35 |
|
35 | 36 |
|
@@ -118,7 +119,7 @@ def stage_ids_this_rank( |
118 | 119 | stages_per_rank == 2 |
119 | 120 | ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" |
120 | 121 | stage_v_pairs = list( |
121 | | - zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1)) |
| 122 | + zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1), strict=True) |
122 | 123 | ) |
123 | 124 | return stage_v_pairs[pp_rank] |
124 | 125 |
|
@@ -352,3 +353,96 @@ def _build_stage_from_modules( |
352 | 353 | models.append(model_chunk) |
353 | 354 |
|
354 | 355 | return stages, models |
| 356 | + |
| 357 | + |
def module_split(
    model: nn.Module,
    module_names_per_stage: list[list[str]],
) -> list[nn.Module]:
    """
    Build one container module per pipeline stage from named children of ``model``.

    Each returned chunk is a fresh ``nn.Module`` that holds *references* to the
    selected submodules of ``model``. The submodules are not copied, so they are
    shared between ``model`` and the returned chunks; ``model`` itself is not
    restructured by this function.

    Only direct children of ``model`` are inspected:

    - A child that is an ``nn.ModuleDict`` is selected entry by entry with
      ``"<child>.<key>"``. Every selected entry is registered directly on the
      stage under that same dotted name.
    - A child that is an ``nn.ModuleList`` is selected entry by entry with
      ``"<child>.<index>"``. The selected entries are gathered, in their
      original order, into a new ``nn.ModuleList`` registered under
      ``"<child>"``, so inside the stage they are indexed from 0 again.
    - Any other child is selected by its plain name (e.g. ``"norm"``).

    Names that match nothing are skipped and reported with a warning. This
    includes a bare container name such as ``"layers"``, a non-numeric or
    out-of-range list index, and paths nested deeper than one container level.

    Args:
        model: The complete model to be split.
        module_names_per_stage: List of lists, where each inner list contains the module names
            that should be included in that stage. Module names should be
            dot-separated paths. Examples:
            - "tok_embeddings" for token embeddings
            - "layers.0", "layers.1" for specific transformer layers
            - "norm" for the final normalization layer
            - "output" for the output projection layer

    Returns:
        List of model chunks, one per entry of ``module_names_per_stage``, in
        the same order.

    Example usage:
        module_names_per_stage = [
            ["tok_embeddings", "layers.0"],     # Stage 0: embeddings + first layer
            ["layers.1", "layers.2"],           # Stage 1: middle layers
            ["norm", "output"]                  # Stage 2: final norm + output
        ]
    """

    def _build_stage_from_modules(stage_idx: int, module_names: list[str]) -> nn.Module:
        """Assemble the container module for a single stage."""
        stage_model = nn.Module()
        # Set for O(1) membership tests while walking the model's children.
        modules_to_keep = set(module_names)
        # Requested names that were actually found; used to report the rest.
        matched_names: set[str] = set()
        logger.debug(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}")

        for module_name, module_value in model.named_children():
            # Handle layer-like structures (e.g., "layers.0", "layers.1")
            if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)):
                prefix = f"{module_name}."
                layers_to_keep = {
                    name.removeprefix(prefix)
                    for name in modules_to_keep
                    if name.startswith(prefix)
                }

                if not layers_to_keep:
                    continue

                if isinstance(module_value, nn.ModuleDict):
                    # Iterate the container (not the request) so the stage
                    # keeps the container's own ordering.
                    for layer_name, layer in module_value.items():
                        if layer_name in layers_to_keep:
                            full_name = f"{prefix}{layer_name}"
                            # nn.Module.__setattr__ registers the submodule
                            # under the dotted name as-is.
                            setattr(stage_model, full_name, layer)
                            matched_names.add(full_name)
                else:
                    num_layers = len(module_value)
                    indices_to_keep = set()
                    for suffix in layers_to_keep:
                        # Non-numeric suffixes cannot index a ModuleList.
                        if not suffix.isdigit():
                            continue
                        index = int(suffix)
                        indices_to_keep.add(index)
                        if index < num_layers:
                            matched_names.add(f"{prefix}{suffix}")
                    new_layers = nn.ModuleList(
                        [
                            layer
                            for i, layer in enumerate(module_value)
                            if i in indices_to_keep
                        ]
                    )
                    setattr(stage_model, module_name, new_layers)

                continue

            # Handle simple module attributes (e.g., "linear", "norm")
            if module_name in modules_to_keep:
                setattr(stage_model, module_name, module_value)
                matched_names.add(module_name)

        unmatched_names = modules_to_keep - matched_names
        if unmatched_names:
            # Surface typos / unsupported paths instead of dropping them silently.
            logger.warning(
                f"Stage {stage_idx}: module names not found in the model "
                f"and skipped: {sorted(unmatched_names)}"
            )

        return stage_model

    models = []

    for stage_idx, module_names in enumerate(module_names_per_stage):
        model_chunk = _build_stage_from_modules(stage_idx, module_names)
        logger.info(f"building stage_idx {stage_idx} with modules {module_names}")
        models.append(model_chunk)

    return models
0 commit comments