From 365345327fe6bbb04e56d85de18b84dadfcbd9fe Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Wed, 8 Oct 2025 10:56:34 -0700 Subject: [PATCH 1/7] Submitting an interactive notebook to run SFT --- apps/sft_v2/README_NOTEBOOK.md | 435 ++++++++++++++++++ apps/sft_v2/notebook_utils.py | 463 +++++++++++++++++++ apps/sft_v2/sft_training_notebook.ipynb | 568 ++++++++++++++++++++++++ 3 files changed, 1466 insertions(+) create mode 100644 apps/sft_v2/README_NOTEBOOK.md create mode 100644 apps/sft_v2/notebook_utils.py create mode 100644 apps/sft_v2/sft_training_notebook.ipynb diff --git a/apps/sft_v2/README_NOTEBOOK.md b/apps/sft_v2/README_NOTEBOOK.md new file mode 100644 index 000000000..eb70a29ea --- /dev/null +++ b/apps/sft_v2/README_NOTEBOOK.md @@ -0,0 +1,435 @@ +# ๐Ÿš€ SFT Training Notebook Guide + +This directory contains an interactive Jupyter notebook experience for training Language Models with Supervised Fine-Tuning (SFT). + +## ๐Ÿ“ Files + +### Core Files +- **`sft_training_notebook.ipynb`** - Main Jupyter notebook for interactive training +- **`notebook_utils.py`** - Utility functions for notebook-based training +- **`main.py`** - Original command-line training script (unchanged) + +### Configuration Files +- **`llama3_8b.yaml`** - Original single-node config +- **`llama3_8b_single_node.yaml`** - Single-node config without provisioner +- **`llama3_8b_slurm_multinode.yaml`** - Multi-node config with SLURM +- **`llama3_8b_local.yaml`** - Local testing config + +## ๐ŸŽฏ Quick Start + +### 1. Open the Notebook + +```bash +cd /home/hosseinkh/forge +jupyter notebook apps/sft_v2/sft_training_notebook.ipynb +``` + +Or in VS Code: +- Open `apps/sft_v2/sft_training_notebook.ipynb` +- Select Python kernel +- Run cells sequentially + +### 2. Configure Training + +The notebook is organized into sections: + +1. **๐Ÿ“ฆ Model Configuration** - Choose model and path +2. **โš™๏ธ Training Configuration** - Set hyperparameters +3. **๐Ÿ”ง Optimizer Configuration** - Configure optimizer and LR scheduler +4. **๐Ÿ”€ Parallelism Configuration** - Set distributed training strategy +5. **๐Ÿ’พ Checkpoint Configuration** - Configure checkpointing +6. **๐Ÿ–ฅ๏ธ Resource Configuration** - Set number of GPUs/nodes +7. **โ˜๏ธ Provisioner Configuration** (optional) - For multi-node SLURM + +### 3. Run Training + +Execute the "Run Training!" cell to start training with your configuration. + +## ๐Ÿ“š Using the Utility Library + +The `notebook_utils.py` module provides a clean API for training: + +### Configuration Builders + +```python +from apps.sft_v2 import notebook_utils as nb + +# Create model config +model_config = nb.create_model_config( + name="llama3", + flavor="8B", + hf_assets_path="/path/to/model" +) + +# Create training config +training_config = nb.create_training_config( + steps=1000, + local_batch_size=1, + seq_len=2048 +) + +# Create optimizer config +optimizer_config = nb.create_optimizer_config( + name="AdamW", + lr=1e-5 +) + +# ... configure other components + +# Build complete config +config = nb.build_config( + model_config=model_config, + training_config=training_config, + optimizer_config=optimizer_config, + # ... 
other configs +) +``` + +### Training Functions + +```python +# Simple: run everything +nb.train(config) + +# Advanced: step-by-step control +import asyncio + +async def custom_training(): + # Initialize + await nb.initialize_provisioner(config) + + # Create and setup + recipe = await nb.create_recipe(config) + await nb.setup_recipe(recipe) + + # Train + await nb.train_recipe(recipe) + + # Cleanup + await nb.cleanup_recipe(recipe) + await nb.shutdown_provisioner(config) + +asyncio.run(custom_training()) +``` + +### Display Utilities + +```python +# Print summary +nb.summarize_config(config) + +# Print full YAML +nb.print_config(config, title="My Config") +``` + +## ๐Ÿ”ง Configuration Functions Reference + +### Model Configuration + +```python +nb.create_model_config( + name: str = "llama3", + flavor: str = "8B", + hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct" +) +``` + +### Training Configuration + +```python +nb.create_training_config( + local_batch_size: int = 1, + seq_len: int = 2048, + max_norm: float = 1.0, + steps: int = 1000, + dataset: str = "c4", + compile: bool = False +) +``` + +### Optimizer Configuration + +```python +nb.create_optimizer_config( + name: str = "AdamW", + lr: float = 1e-5, + eps: float = 1e-8, + weight_decay: float = 0.0, + betas: tuple = (0.9, 0.999) +) +``` + +### LR Scheduler Configuration + +```python +nb.create_lr_scheduler_config( + warmup_steps: int = 200, + decay_steps: Optional[int] = None, + min_lr: float = 0.0 +) +``` + +### Parallelism Configuration + +```python +nb.create_parallelism_config( + data_parallel_replicate_degree: int = 1, + data_parallel_shard_degree: int = -1, # -1 = auto (FSDP) + tensor_parallel_degree: int = 1, + pipeline_parallel_degree: int = 1, + context_parallel_degree: int = 1, + expert_parallel_degree: int = 1, + disable_loss_parallel: bool = False +) +``` + +### Checkpoint Configuration + +```python +nb.create_checkpoint_config( + enable: bool = True, + folder: str = "/tmp/checkpoints", + initial_load_path: Optional[str] = None, + initial_load_in_hf: bool = True, + last_save_in_hf: bool = True, + interval: int = 500, + async_mode: str = "disabled" +) +``` + +### Activation Checkpoint Configuration + +```python +nb.create_activation_checkpoint_config( + mode: str = "selective", # 'selective', 'full', 'none' + selective_ac_option: str = "op" +) +``` + +### Process Configuration + +```python +# Single node +nb.create_process_config( + procs: int = 8, + with_gpus: bool = True, + hosts: Optional[int] = None +) + +# Multi-node +nb.create_process_config( + procs: int = 8, + with_gpus: bool = True, + hosts: int = 4 # 4 nodes +) +``` + +### Provisioner Configuration (Multi-Node Only) + +```python +nb.create_provisioner_config( + launcher: str = "slurm", + job_name: str = "sft_training", + partition: Optional[str] = None, + time: Optional[str] = None, + account: Optional[str] = None +) +``` + +## ๐Ÿ“– Example Configurations + +### Quick Test (Single GPU, 10 steps) + +```python +model_config = nb.create_model_config( + name="llama3", + flavor="8B", + hf_assets_path="/path/to/model" +) + +training_config = nb.create_training_config( + steps=10, + local_batch_size=1 +) + +process_config = nb.create_process_config(procs=1) + +# ... 
configure other components with defaults +``` + +### Single Node, 8 GPUs, FSDP + +```python +parallelism_config = nb.create_parallelism_config( + data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP +) + +process_config = nb.create_process_config(procs=8) + +# No provisioner needed +provisioner_config = None +``` + +### Multi-Node, 4ร—8 GPUs, Tensor Parallel + +```python +parallelism_config = nb.create_parallelism_config( + data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP + tensor_parallel_degree=2 +) + +process_config = nb.create_process_config( + procs=8, + hosts=4 +) + +provisioner_config = nb.create_provisioner_config( + launcher="slurm", + job_name="sft_multinode", + partition="gpu_partition", + time="24:00:00" +) +``` + +## ๐ŸŽ“ Advanced Usage + +### Custom Training Loop + +You can modify the training loop by creating your own recipe class: + +```python +from apps.sft_v2.main import ForgeSFTRecipe + +class CustomRecipe(ForgeSFTRecipe): + async def train(self): + # Custom training logic + dataloader = iter(self.train_dataloader) + + for step in range(self.num_training_steps): + batch = next(dataloader) + # Custom batch processing + self.train_step(batch) +``` + +### Experiment Tracking + +Integrate with your favorite tracking tool: + +```python +import wandb + +# Initialize tracking +wandb.init(project="sft-training", config=config) + +# Train +nb.train(config) + +# Log results +wandb.log({"final_step": config.training.steps}) +``` + +### Config Variations + +Generate multiple configs for hyperparameter sweeps: + +```python +learning_rates = [1e-5, 5e-5, 1e-4] +configs = [] + +for lr in learning_rates: + optimizer_config = nb.create_optimizer_config(lr=lr) + config = nb.build_config( + # ... other configs + optimizer_config=optimizer_config + ) + configs.append(config) + +# Train all configs +for config in configs: + nb.train(config) +``` + +## ๐Ÿ” Debugging Tips + +### Start Simple + +1. **Use 1 GPU first**: + ```python + process_config = nb.create_process_config(procs=1) + ``` + +2. **Run few steps**: + ```python + training_config = nb.create_training_config(steps=10) + ``` + +3. **Disable compilation**: + ```python + training_config = nb.create_training_config(compile=False) + ``` + +### Common Issues + +**Memory Errors:** +- Reduce batch size or sequence length +- Enable FSDP: `data_parallel_shard_degree=-1` +- Enable activation checkpointing: `mode="selective"` or `"full"` + +**Slow Training:** +- Increase batch size if memory allows +- Enable compilation: `compile=True` +- Use tensor parallelism for large models + +**Actor Timeout Errors:** +- Make sure you're not using provisioner config on single node +- Check SLURM availability with `sinfo` +- See `TROUBLESHOOTING_MULTINODE.md` for details + +## ๐Ÿ“ฆ Saving and Loading Configs + +### Save Config + +```python +from omegaconf import OmegaConf + +config_path = "my_config.yaml" +with open(config_path, 'w') as f: + OmegaConf.save(config, f) +``` + +### Load Config + +```python +from omegaconf import OmegaConf + +config = OmegaConf.load("my_config.yaml") +nb.train(config) +``` + +## ๐Ÿš€ Next Steps + +1. **Start with the notebook**: Open `sft_training_notebook.ipynb` and follow along +2. **Try a test run**: Configure for 10 steps with 1 GPU +3. **Scale up**: Increase to 8 GPUs with FSDP +4. 
**Go multi-node**: Configure SLURM provisioner for cluster training + +## ๐Ÿ“š Additional Resources + +- **`MULTINODE_SFT_V2_GUIDE.md`** - Detailed guide on multi-node training +- **`TROUBLESHOOTING_MULTINODE.md`** - Troubleshooting guide for multi-node issues +- **`main.py`** - Original implementation for reference + +## ๐Ÿค Contributing + +To add new configuration options: + +1. Add a `create_*_config()` function in `notebook_utils.py` +2. Update `build_config()` to include the new config +3. Add a new cell in the notebook to configure it +4. Update this README + +## โš–๏ธ License + +Copyright (c) Meta Platforms, Inc. and affiliates. + +Licensed under the BSD-style license found in the LICENSE file. diff --git a/apps/sft_v2/notebook_utils.py b/apps/sft_v2/notebook_utils.py new file mode 100644 index 000000000..b3636fd26 --- /dev/null +++ b/apps/sft_v2/notebook_utils.py @@ -0,0 +1,463 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utility functions for notebook-based SFT training. +This module provides a clean API for interactive training in Jupyter notebooks. +""" + +import asyncio +import logging +from typing import Any, Dict, Optional + +import torch + +from apps.sft_v2.main import ForgeSFTRecipe +from omegaconf import DictConfig, OmegaConf + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +# ============================================================================ +# Configuration Builders +# ============================================================================ + + +def create_model_config( + name: str = "llama3", + flavor: str = "8B", + hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct", +) -> Dict[str, Any]: + """ + Create model configuration. + + Args: + name: Model architecture name (e.g., 'llama3', 'llama2') + flavor: Model size (e.g., '8B', '70B') + hf_assets_path: Path to HuggingFace model assets + + Returns: + Dictionary with model configuration + """ + return { + "name": name, + "flavor": flavor, + "hf_assets_path": hf_assets_path, + } + + +def create_optimizer_config( + name: str = "AdamW", + lr: float = 1e-5, + eps: float = 1e-8, + weight_decay: float = 0.0, + betas: tuple = (0.9, 0.999), +) -> Dict[str, Any]: + """ + Create optimizer configuration. + + Args: + name: Optimizer name (e.g., 'AdamW', 'Adam', 'SGD') + lr: Learning rate + eps: Epsilon for numerical stability + weight_decay: L2 regularization coefficient + betas: Coefficients for computing running averages + + Returns: + Dictionary with optimizer configuration + """ + return { + "name": name, + "lr": lr, + "eps": eps, + "weight_decay": weight_decay, + "betas": list(betas), + } + + +def create_lr_scheduler_config( + warmup_steps: int = 200, + decay_steps: Optional[int] = None, + min_lr: float = 0.0, +) -> Dict[str, Any]: + """ + Create learning rate scheduler configuration. 
+ + Args: + warmup_steps: Number of warmup steps + decay_steps: Number of decay steps (None = no decay) + min_lr: Minimum learning rate + + Returns: + Dictionary with LR scheduler configuration + """ + config = {"warmup_steps": warmup_steps} + if decay_steps is not None: + config["decay_steps"] = decay_steps + if min_lr > 0: + config["min_lr"] = min_lr + return config + + +def create_training_config( + local_batch_size: int = 1, + seq_len: int = 2048, + max_norm: float = 1.0, + steps: int = 1000, + dataset: str = "c4", + compile: bool = False, +) -> Dict[str, Any]: + """ + Create training configuration. + + Args: + local_batch_size: Batch size per GPU + seq_len: Sequence length + max_norm: Gradient clipping max norm + steps: Total training steps + dataset: Dataset name + compile: Whether to use torch.compile + + Returns: + Dictionary with training configuration + """ + return { + "local_batch_size": local_batch_size, + "seq_len": seq_len, + "max_norm": max_norm, + "steps": steps, + "dataset": dataset, + "compile": compile, + } + + +def create_parallelism_config( + data_parallel_replicate_degree: int = 1, + data_parallel_shard_degree: int = -1, + tensor_parallel_degree: int = 1, + pipeline_parallel_degree: int = 1, + context_parallel_degree: int = 1, + expert_parallel_degree: int = 1, + disable_loss_parallel: bool = False, +) -> Dict[str, Any]: + """ + Create parallelism configuration. + + Args: + data_parallel_replicate_degree: Data parallel replication + data_parallel_shard_degree: Data parallel sharding (FSDP), -1 = auto + tensor_parallel_degree: Tensor parallelism degree + pipeline_parallel_degree: Pipeline parallelism degree + context_parallel_degree: Context parallelism degree + expert_parallel_degree: Expert parallelism degree (for MoE) + disable_loss_parallel: Whether to disable loss parallelism + + Returns: + Dictionary with parallelism configuration + """ + return { + "data_parallel_replicate_degree": data_parallel_replicate_degree, + "data_parallel_shard_degree": data_parallel_shard_degree, + "tensor_parallel_degree": tensor_parallel_degree, + "pipeline_parallel_degree": pipeline_parallel_degree, + "context_parallel_degree": context_parallel_degree, + "expert_parallel_degree": expert_parallel_degree, + "disable_loss_parallel": disable_loss_parallel, + } + + +def create_checkpoint_config( + enable: bool = True, + folder: str = "/tmp/checkpoints", + initial_load_path: Optional[str] = None, + initial_load_in_hf: bool = True, + last_save_in_hf: bool = True, + interval: int = 500, + async_mode: str = "disabled", +) -> Dict[str, Any]: + """ + Create checkpoint configuration. + + Args: + enable: Whether to enable checkpointing + folder: Path to save checkpoints + initial_load_path: Path to load initial checkpoint from + initial_load_in_hf: Load initial checkpoint in HF format + last_save_in_hf: Save last checkpoint in HF format + interval: Steps between checkpoints + async_mode: Async checkpoint mode ('disabled', 'async', etc.) + + Returns: + Dictionary with checkpoint configuration + """ + return { + "enable": enable, + "folder": folder, + "initial_load_path": initial_load_path, + "initial_load_in_hf": initial_load_in_hf, + "last_save_in_hf": last_save_in_hf, + "interval": interval, + "async_mode": async_mode, + } + + +def create_activation_checkpoint_config( + mode: str = "selective", + selective_ac_option: str = "op", +) -> Dict[str, Any]: + """ + Create activation checkpointing configuration. 
+ + Args: + mode: Activation checkpoint mode ('selective', 'full', 'none') + selective_ac_option: Selective AC option ('op', 'layer', etc.) + + Returns: + Dictionary with activation checkpoint configuration + """ + return { + "mode": mode, + "selective_ac_option": selective_ac_option, + } + + +def create_process_config( + procs: int = 8, + with_gpus: bool = True, + hosts: Optional[int] = None, +) -> Dict[str, Any]: + """ + Create process configuration. + + Args: + procs: Number of processes per host + with_gpus: Whether to use GPUs + hosts: Number of hosts (None = single node) + + Returns: + Dictionary with process configuration + """ + config = { + "procs": procs, + "with_gpus": with_gpus, + } + if hosts is not None: + config["hosts"] = hosts + return config + + +# ============================================================================ +# Configuration Assembly +# ============================================================================ + + +def build_config( + model_config: Dict[str, Any], + optimizer_config: Dict[str, Any], + lr_scheduler_config: Dict[str, Any], + training_config: Dict[str, Any], + parallelism_config: Dict[str, Any], + checkpoint_config: Dict[str, Any], + activation_checkpoint_config: Dict[str, Any], + process_config: Dict[str, Any], +) -> DictConfig: + """ + Build complete configuration from component configs. + + Args: + model_config: Model configuration + optimizer_config: Optimizer configuration + lr_scheduler_config: LR scheduler configuration + training_config: Training configuration + parallelism_config: Parallelism configuration + checkpoint_config: Checkpoint configuration + activation_checkpoint_config: Activation checkpoint configuration + process_config: Process configuration + + Returns: + Complete OmegaConf DictConfig + """ + config = { + "comm": {"trace_buf_size": 0}, + "model": model_config, + "optimizer": optimizer_config, + "lr_scheduler": lr_scheduler_config, + "training": training_config, + "parallelism": parallelism_config, + "checkpoint": checkpoint_config, + "activation_checkpoint": activation_checkpoint_config, + "processes": process_config, + } + + return OmegaConf.create(config) + + +# ============================================================================ +# Training Functions +# ============================================================================ + + +async def create_recipe(config: DictConfig): + """ + Create and return a ForgeSFTRecipe actor. + + Args: + config: Complete configuration + + Returns: + ForgeSFTRecipe actor instance + """ + process_cfg = config.pop("processes") + recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(config) + logger.info("Recipe created successfully") + return recipe + + +async def setup_recipe(recipe): + """ + Setup the recipe (load model, initialize data loaders, etc.). + + Args: + recipe: ForgeSFTRecipe actor instance + """ + logger.info("Setting up recipe...") + await recipe.setup.call() + logger.info("Recipe setup complete") + + +async def train_recipe(recipe): + """ + Run training on the recipe. + + Args: + recipe: ForgeSFTRecipe actor instance + """ + logger.info("Starting training...") + await recipe.train.call() + logger.info("Training complete") + + +async def cleanup_recipe(recipe): + """ + Cleanup recipe resources. 
+ + Args: + recipe: ForgeSFTRecipe actor instance + """ + logger.info("Cleaning up...") + await recipe.cleanup.call() + await recipe.mesh.stop() + logger.info("Cleanup complete") + + +# ============================================================================ +# High-Level Training API +# ============================================================================ + + +async def run_training(config: DictConfig): + """ + Run complete training pipeline with the given configuration. + + Args: + config: Complete configuration + + Raises: + Exception: If training fails + """ + # Create recipe + recipe = await create_recipe(config) + + # Setup + await setup_recipe(recipe) + + # Train + await train_recipe(recipe) + + # Cleanup + await cleanup_recipe(recipe) + + +def train(config: DictConfig): + """ + Synchronous wrapper for run_training. + + Args: + config: Complete configuration + """ + asyncio.run(run_training(config)) + + +# ============================================================================ +# Display Utilities +# ============================================================================ + + +def print_config(config: DictConfig, title: str = "Configuration"): + """ + Pretty print configuration. + + Args: + config: Configuration to print + title: Title for the output + """ + print(f"\n{'='*60}") + print(f"{title:^60}") + print(f"{'='*60}") + print(OmegaConf.to_yaml(config)) + print(f"{'='*60}\n") + + +def summarize_config(config: DictConfig): + """ + Print a summary of the configuration. + + Args: + config: Configuration to summarize + """ + print("\n" + "=" * 60) + print("Configuration Summary".center(60)) + print("=" * 60) + + print(f"\n๐Ÿ“ฆ Model:") + print(f" โ€ข Name: {config.model.name}") + print(f" โ€ข Flavor: {config.model.flavor}") + print(f" โ€ข Path: {config.model.hf_assets_path}") + + print(f"\nโš™๏ธ Training:") + print(f" โ€ข Steps: {config.training.steps}") + print(f" โ€ข Batch Size: {config.training.local_batch_size}") + print(f" โ€ข Sequence Length: {config.training.seq_len}") + print(f" โ€ข Dataset: {config.training.dataset}") + + print(f"\n๐Ÿ”ง Optimizer:") + print(f" โ€ข Name: {config.optimizer.name}") + print(f" โ€ข Learning Rate: {config.optimizer.lr}") + print(f" โ€ข Warmup Steps: {config.lr_scheduler.warmup_steps}") + + print(f"\n๐Ÿ”€ Parallelism:") + print( + f" โ€ข Data Parallel (Replicate): {config.parallelism.data_parallel_replicate_degree}" + ) + print( + f" โ€ข Data Parallel (Shard/FSDP): {config.parallelism.data_parallel_shard_degree}" + ) + print(f" โ€ข Tensor Parallel: {config.parallelism.tensor_parallel_degree}") + print(f" โ€ข Pipeline Parallel: {config.parallelism.pipeline_parallel_degree}") + + print(f"\n๐Ÿ’พ Checkpointing:") + print(f" โ€ข Enabled: {config.checkpoint.enable}") + print(f" โ€ข Folder: {config.checkpoint.folder}") + print(f" โ€ข Interval: {config.checkpoint.interval} steps") + + print(f"\n๐Ÿ–ฅ๏ธ Resources:") + if "hosts" in config.processes: + print(f" โ€ข Hosts: {config.processes.hosts}") + print(f" โ€ข Processes per host: {config.processes.procs}") + print(f" โ€ข GPUs: {config.processes.with_gpus}") + + print("\n" + "=" * 60 + "\n") diff --git a/apps/sft_v2/sft_training_notebook.ipynb b/apps/sft_v2/sft_training_notebook.ipynb new file mode 100644 index 000000000..204ec15a9 --- /dev/null +++ b/apps/sft_v2/sft_training_notebook.ipynb @@ -0,0 +1,568 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ๐Ÿš€ SFT Training Notebook\n", + "\n", + "This notebook provides an interactive interface for 
training Language Models using Supervised Fine-Tuning (SFT).\n", + "\n", + "## Features\n", + "- โœ… Interactive configuration in separate cells\n", + "- โœ… Support for single-node and multi-node training\n", + "- โœ… Easy hyperparameter tuning\n", + "- โœ… Flexible parallelism strategies\n", + "- โœ… Checkpoint management\n", + "\n", + "## Quick Start\n", + "1. Configure each section (model, training, etc.)\n", + "2. Review the complete configuration\n", + "3. Run training!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“š Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '/home/hosseinkh/forge')\n", + "\n", + "from apps.sft_v2 import notebook_utils as nb\n", + "import torch\n", + "\n", + "print(f\"โœ… Imports successful!\")\n", + "print(f\"๐Ÿ“Š PyTorch version: {torch.__version__}\")\n", + "print(f\"๐ŸŽฎ CUDA available: {torch.cuda.is_available()}\")\n", + "if torch.cuda.is_available():\n", + " print(f\"๐Ÿ”ข Number of GPUs: {torch.cuda.device_count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“ฆ Model Configuration\n", + "\n", + "Configure the model you want to train." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Model Configuration\n", + "model_config = nb.create_model_config(\n", + " name=\"llama3\",\n", + " flavor=\"8B\",\n", + " hf_assets_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct\"\n", + ")\n", + "\n", + "print(\"๐Ÿ“ฆ Model Configuration:\")\n", + "for key, value in model_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## โš™๏ธ Training Configuration\n", + "\n", + "Set training hyperparameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Training Configuration\n", + "training_config = nb.create_training_config(\n", + " local_batch_size=1, # Batch size per GPU\n", + " seq_len=2048, # Sequence length\n", + " max_norm=1.0, # Gradient clipping\n", + " steps=1000, # Total training steps\n", + " dataset=\"c4\", # Dataset name\n", + " compile=False # Use torch.compile?\n", + ")\n", + "\n", + "print(\"โš™๏ธ Training Configuration:\")\n", + "for key, value in training_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”ง Optimizer Configuration\n", + "\n", + "Configure the optimizer and learning rate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optimizer Configuration\n", + "optimizer_config = nb.create_optimizer_config(\n", + " name=\"AdamW\",\n", + " lr=1e-5, # Learning rate\n", + " eps=1e-8, # Epsilon\n", + " weight_decay=0.0, # Weight decay\n", + " betas=(0.9, 0.999) # Adam betas\n", + ")\n", + "\n", + "# LR Scheduler Configuration\n", + "lr_scheduler_config = nb.create_lr_scheduler_config(\n", + " warmup_steps=200, # Warmup steps\n", + " decay_steps=None, # Decay steps (None = no decay)\n", + " min_lr=0.0 # Minimum LR\n", + ")\n", + "\n", + "print(\"๐Ÿ”ง Optimizer Configuration:\")\n", + "for key, value in optimizer_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "\n", + "print(\"\\n๐Ÿ“ˆ LR Scheduler Configuration:\")\n", + "for key, value in lr_scheduler_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”€ Parallelism Configuration\n", + "\n", + "Configure distributed training strategies.\n", + "\n", + "### Parallelism Options:\n", + "- **Data Parallel (Replicate)**: Basic data parallelism\n", + "- **Data Parallel (Shard/FSDP)**: Fully Sharded Data Parallel (-1 = use all GPUs)\n", + "- **Tensor Parallel**: Split model across multiple GPUs\n", + "- **Pipeline Parallel**: Split model stages across GPUs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Parallelism Configuration\n", + "parallelism_config = nb.create_parallelism_config(\n", + " data_parallel_replicate_degree=1, # DP replicate\n", + " data_parallel_shard_degree=-1, # FSDP (-1 = auto, uses all GPUs)\n", + " tensor_parallel_degree=1, # TP\n", + " pipeline_parallel_degree=1, # PP\n", + " context_parallel_degree=1, # CP\n", + " expert_parallel_degree=1, # EP (for MoE)\n", + " disable_loss_parallel=False\n", + ")\n", + "\n", + "print(\"๐Ÿ”€ Parallelism Configuration:\")\n", + "for key, value in parallelism_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ’พ Checkpoint Configuration\n", + "\n", + "Configure model checkpointing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Checkpoint Configuration\n", + "checkpoint_config = nb.create_checkpoint_config(\n", + " enable=True,\n", + " folder=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", + " initial_load_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/\",\n", + " initial_load_in_hf=True,\n", + " last_save_in_hf=True,\n", + " interval=500, # Save every N steps\n", + " async_mode=\"disabled\"\n", + ")\n", + "\n", + "# Activation Checkpoint Configuration (for memory efficiency)\n", + "activation_checkpoint_config = nb.create_activation_checkpoint_config(\n", + " mode=\"selective\", # 'selective', 'full', or 'none'\n", + " selective_ac_option=\"op\" # 'op' or 'layer'\n", + ")\n", + "\n", + "print(\"๐Ÿ’พ Checkpoint Configuration:\")\n", + "for key, value in checkpoint_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "\n", + "print(\"\\n๐Ÿ”„ Activation Checkpoint Configuration:\")\n", + "for key, value in activation_checkpoint_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ–ฅ๏ธ Resource Configuration\n", + "\n", + "Configure compute resources.\n", + "\n", + "### Options:\n", + "- **Single Node**: Set only `procs` (number of GPUs)\n", + "- **Multi Node**: Set both `hosts` (number of nodes) and `procs` (GPUs per node)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Choose ONE of the following:\n", + "\n", + "# Option 1: Single Node (8 GPUs)\n", + "process_config = nb.create_process_config(\n", + " procs=8,\n", + " with_gpus=True,\n", + " hosts=None # None = single node\n", + ")\n", + "\n", + "# Option 2: Multi-Node (4 nodes ร— 8 GPUs = 32 total)\n", + "# Uncomment to use:\n", + "# process_config = nb.create_process_config(\n", + "# procs=8,\n", + "# with_gpus=True,\n", + "# hosts=4\n", + "# )\n", + "\n", + "print(\"๐Ÿ–ฅ๏ธ Resource Configuration:\")\n", + "for key, value in process_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "\n", + "if \"hosts\" in process_config and process_config[\"hosts\"]:\n", + " total_gpus = process_config[\"hosts\"] * process_config[\"procs\"]\n", + " print(f\"\\n๐Ÿ“Š Total GPUs: {total_gpus}\")\n", + "else:\n", + " print(f\"\\n๐Ÿ“Š Total GPUs: {process_config['procs']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## โ˜๏ธ Provisioner Configuration (Optional)\n", + "\n", + "**Only needed for multi-node training on SLURM clusters.**\n", + "\n", + "โš ๏ธ Skip this cell if you're running single-node training!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Provisioner Configuration (OPTIONAL - for multi-node only)\n", + "# Set to None for single-node training\n", + "\n", + "provisioner_config = None # Default: no provisioner\n", + "\n", + "# Uncomment and configure for SLURM multi-node training:\n", + "# provisioner_config = nb.create_provisioner_config(\n", + "# launcher=\"slurm\",\n", + "# job_name=\"sft_training\",\n", + "# partition=\"your_gpu_partition\", # REQUIRED for SLURM\n", + "# time=\"24:00:00\", # REQUIRED for SLURM\n", + "# account=\"your_account\" # May be required\n", + "# )\n", + "\n", + "if provisioner_config:\n", + " print(\"โ˜๏ธ Provisioner Configuration:\")\n", + " for key, value in provisioner_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "else:\n", + " print(\"โ˜๏ธ Provisioner: Disabled (single-node mode)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”จ Build Complete Configuration\n", + "\n", + "Combine all configurations into a single config object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build complete configuration\n", + "config = nb.build_config(\n", + " model_config=model_config,\n", + " optimizer_config=optimizer_config,\n", + " lr_scheduler_config=lr_scheduler_config,\n", + " training_config=training_config,\n", + " parallelism_config=parallelism_config,\n", + " checkpoint_config=checkpoint_config,\n", + " activation_checkpoint_config=activation_checkpoint_config,\n", + " process_config=process_config,\n", + " provisioner_config=provisioner_config\n", + ")\n", + "\n", + "print(\"โœ… Configuration built successfully!\\n\")\n", + "\n", + "# Display summary\n", + "nb.summarize_config(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“„ View Full Configuration (YAML)\n", + "\n", + "See the complete configuration in YAML format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print full configuration\n", + "nb.print_config(config, title=\"Complete Training Configuration\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ’พ Save Configuration (Optional)\n", + "\n", + "Save the configuration to a YAML file for later use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from omegaconf import OmegaConf\n", + "\n", + "# Save configuration\n", + "config_path = \"/home/hosseinkh/forge/apps/sft_v2/my_training_config.yaml\"\n", + "with open(config_path, 'w') as f:\n", + " OmegaConf.save(config, f)\n", + "\n", + "print(f\"โœ… Configuration saved to: {config_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿš€ Run Training!\n", + "\n", + "Start the training process with the configured settings.\n", + "\n", + "โš ๏ธ **Note**: This will start actual training and may take a long time!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training\n", + "print(\"๐Ÿš€ Starting training...\\n\")\n", + "\n", + "try:\n", + " nb.train(config)\n", + " print(\"\\nโœ… Training completed successfully!\")\n", + "except Exception as e:\n", + " print(f\"\\nโŒ Training failed: {e}\")\n", + " import traceback\n", + " traceback.print_exc()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ” Advanced: Step-by-Step Execution\n", + "\n", + "For more control, you can run each training stage separately.\n", + "\n", + "โš ๏ธ **Only run this section if you want manual control. Otherwise, use the cell above.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Initialize provisioner (if configured)\n", + "import asyncio\n", + "\n", + "provisioner_initialized = await nb.initialize_provisioner(config)\n", + "print(f\"Provisioner initialized: {provisioner_initialized}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 2: Create recipe\n", + "recipe = await nb.create_recipe(config)\n", + "print(\"Recipe created\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 3: Setup recipe (load model, data, etc.)\n", + "await nb.setup_recipe(recipe)\n", + "print(\"Recipe setup complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 4: Run training\n", + "await nb.train_recipe(recipe)\n", + "print(\"Training complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 5: Cleanup\n", + "await nb.cleanup_recipe(recipe)\n", + "print(\"Cleanup complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 6: Shutdown provisioner (if initialized)\n", + "if provisioner_initialized:\n", + " await nb.shutdown_provisioner(config)\n", + " print(\"Provisioner shutdown complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“Š Tips & Tricks\n", + "\n", + "### Memory Optimization\n", + "- Use **FSDP** (set `data_parallel_shard_degree=-1`) for large models\n", + "- Enable **activation checkpointing** (set `mode=\"selective\"` or `\"full\"`)\n", + "- Reduce **batch size** or **sequence length**\n", + "\n", + "### Speed Optimization\n", + "- Use **tensor parallelism** for large models (set `tensor_parallel_degree > 1`)\n", + "- Enable **compilation** (set `compile=True`)\n", + "- Increase **batch size** if memory allows\n", + "\n", + "### Multi-Node Training\n", + "- Set `hosts` in process config\n", + "- Configure provisioner with SLURM details\n", + "- Make sure model path is accessible on all nodes\n", + "\n", + "### Debugging\n", + "- Start with fewer steps (e.g., `steps=10`)\n", + "- Use single GPU first (`procs=1`)\n", + "- Check logs for errors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐ŸŽฏ Common Configurations\n", + "\n", + "### Quick Test Run\n", + "```python\n", + "training_config = nb.create_training_config(\n", + " steps=10,\n", + " local_batch_size=1\n", + ")\n", + "process_config = nb.create_process_config(procs=1)\n", + "```\n", + "\n", + "### Single Node, 8 GPUs, FSDP\n", + "```python\n", + 
"parallelism_config = nb.create_parallelism_config(\n", + " data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP\n", + ")\n", + "process_config = nb.create_process_config(procs=8)\n", + "```\n", + "\n", + "### Multi-Node, 4ร—8 GPUs, TP=2\n", + "```python\n", + "parallelism_config = nb.create_parallelism_config(\n", + " data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP\n", + " tensor_parallel_degree=2\n", + ")\n", + "process_config = nb.create_process_config(procs=8, hosts=4)\n", + "provisioner_config = nb.create_provisioner_config(\n", + " launcher=\"slurm\",\n", + " partition=\"gpu_partition\"\n", + ")\n", + "```" + ] + } + ], + "metadata": { + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From baeb35b26a4ee495cabfaed5391c0a49e9807733 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Thu, 9 Oct 2025 13:29:23 -0700 Subject: [PATCH 2/7] Submitting an interactive notebook to run SFT --- apps/sft_v2/NOTEBOOK_GUIDE.md | 847 ++++++++++++++++++ apps/sft_v2/README_NOTEBOOK.md | 435 --------- apps/sft_v2/actor.py | 133 +++ apps/sft_v2/interactive_config_notebook.ipynb | 629 +++++++++++++ apps/sft_v2/notebook_utils.py | 463 ---------- apps/sft_v2/sft_training_notebook.ipynb | 568 ------------ apps/sft_v2/spawn_actor.py | 139 +++ apps/sft_v2/trainer_actor.py | 189 ++++ apps/sft_v2/utils.py | 187 ++++ 9 files changed, 2124 insertions(+), 1466 deletions(-) create mode 100644 apps/sft_v2/NOTEBOOK_GUIDE.md delete mode 100644 apps/sft_v2/README_NOTEBOOK.md create mode 100644 apps/sft_v2/actor.py create mode 100644 apps/sft_v2/interactive_config_notebook.ipynb delete mode 100644 apps/sft_v2/notebook_utils.py delete mode 100644 apps/sft_v2/sft_training_notebook.ipynb create mode 100644 apps/sft_v2/spawn_actor.py create mode 100644 apps/sft_v2/trainer_actor.py create mode 100644 apps/sft_v2/utils.py diff --git a/apps/sft_v2/NOTEBOOK_GUIDE.md b/apps/sft_v2/NOTEBOOK_GUIDE.md new file mode 100644 index 000000000..b3524ed31 --- /dev/null +++ b/apps/sft_v2/NOTEBOOK_GUIDE.md @@ -0,0 +1,847 @@ +# Complete Guide: Interactive Configuration Notebook + +This guide explains step-by-step how to use the interactive configuration notebook for SFT training. + +--- + +## ๐Ÿ“– Table of Contents + +1. [Overview](#overview) +2. [Architecture Components](#architecture-components) +3. [Notebook Step-by-Step](#notebook-step-by-step) +4. [Utility Functions Explained](#utility-functions-explained) +5. [How to Run](#how-to-run) +6. [Common Scenarios](#common-scenarios) +7. [Troubleshooting](#troubleshooting) + +--- + +## Overview + +The interactive configuration notebook (`interactive_config_notebook.ipynb`) allows you to: +- Configure SFT training **without YAML files** +- Define configuration interactively in separate cells +- Easily modify parameters and experiment +- Use pre-built templates for common scenarios + +### What Problem Does This Solve? + +**Before**: You had to edit YAML files, which required: +- External file management +- Reloading files after changes +- Difficult to experiment with different configs + +**After**: You can: +- Define everything in the notebook +- Change values in cells and re-run +- See all configurations clearly +- No external file management needed + +--- + +## Architecture Components + +Before diving into the notebook, let's understand the components: + +### 1. BaseForgeActor (`actor.py`) + +**What it is**: An abstract base class that defines the contract for all actors. 
+ +**What it does**: +- Handles distributed initialization (sets up multi-GPU environment) +- Manages common attributes (model, optimizer, checkpointer, etc.) +- Defines three required methods that subclasses must implement: + - `setup()` - Initialize data, checkpoints, etc. + - `run()` - Main execution logic + - `cleanup()` - Resource cleanup + +**Why it matters**: Provides a consistent interface for different actor types (Trainer, Evaluator, Inferencer, etc.) + +### 2. TrainerActor (`trainer_actor.py`) + +**What it is**: A concrete implementation of BaseForgeActor for training. + +**What it does**: +- Implements the training loop +- Handles forward/backward passes +- Manages checkpointing +- Supports various parallelism strategies (FSDP, Pipeline Parallel, Tensor Parallel) + +**Key Methods**: +- `setup()` - Loads tokenizer, dataset, and checkpoints +- `run()` - Executes the training loop +- `forward_backward()` - Performs forward and backward passes +- `train_step()` - Single training step +- `cleanup()` - Closes resources + +### 3. SpawnActor (`spawn_actor.py`) + +**What it is**: An orchestrator that manages actor lifecycle. + +**What it does**: +- Creates actor instances +- Manages the lifecycle: spawn โ†’ setup โ†’ run โ†’ cleanup +- Provides error handling and cleanup guarantees + +**Key Methods**: +- `spawn()` - Creates the actor instance +- `setup()` - Calls actor's setup +- `run()` - Calls actor's run +- `cleanup()` - Calls actor's cleanup and stops the mesh +- `run_full_lifecycle()` - Executes all phases automatically + +**Why it matters**: Simplifies actor management and ensures proper resource cleanup. + +### 4. Utility Functions (`utils.py`) + +Helper functions for common operations. See [Utility Functions Explained](#utility-functions-explained) section below. + +--- + +## Notebook Step-by-Step + +### Step 1: Import Dependencies + +```python +import asyncio +import logging +from omegaconf import OmegaConf, DictConfig + +from forge.apps.sft_v2.trainer_actor import TrainerActor +from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor +``` + +**What this does**: +- `asyncio` - For async/await operations (actors run asynchronously) +- `logging` - For logging training progress +- `OmegaConf` - For managing configurations (converts dicts to config objects) +- `TrainerActor` - The training actor we'll use +- `SpawnActor`, `run_actor` - For managing actor lifecycle + +**Why we need it**: These are the core dependencies for running the actor-based training. 
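+
+As a quick orientation before the configuration steps, here is a minimal sketch of how these imports fit together (the same flow Steps 2-12 build up in detail; `complete_config` below stands in for the plain dict assembled later in Step 11):
+
+```python
+# Turn the plain dict assembled in Steps 2-11 into an OmegaConf config.
+cfg = OmegaConf.create(complete_config)
+
+# Simple path: spawn, setup, run, and clean up in one call.
+await run_actor(TrainerActor, cfg)
+
+# Or, for phase-by-phase control (see "Alternative: Manual Lifecycle Control"):
+spawner = SpawnActor(TrainerActor, cfg)
+actor = await spawner.spawn()
+await spawner.setup()
+await spawner.run()
+await spawner.cleanup()
+```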
+ +--- + +### Step 2: Configure Model Settings + +```python +model_config = { + "name": "llama3", + "flavor": "8B", + "hf_assets_path": "/tmp/Meta-Llama-3.1-8B-Instruct" +} +``` + +**What this does**: +- `name` - Model architecture type (e.g., "llama3", "llama2") +- `flavor` - Model size (e.g., "8B", "70B", "405B") +- `hf_assets_path` - Path to the model files (tokenizer, weights, config) + +**How to modify**: +- Change `flavor` to use different model sizes +- Update `hf_assets_path` to point to your model location +- Make sure the path contains `tokenizer.json`, `tokenizer_config.json`, and model weights + +**Example variations**: +```python +# For a 70B model +model_config = { + "name": "llama3", + "flavor": "70B", + "hf_assets_path": "/path/to/Meta-Llama-3.1-70B" +} +``` + +--- + +### Step 3: Configure Process Settings + +```python +processes_config = { + "procs": 8, # Number of processes + "with_gpus": True # Use GPUs +} +``` + +**What this does**: +- `procs` - Number of parallel processes (usually = number of GPUs) +- `with_gpus` - Whether to use GPUs or CPUs + +**How to modify**: +- For single GPU: `"procs": 1` +- For 4 GPUs: `"procs": 4` +- For CPU training: `"with_gpus": False` (not recommended for LLMs) + +**Important**: Set `procs` to match your available GPUs! + +--- + +### Step 4: Configure Optimizer Settings + +```python +optimizer_config = { + "name": "AdamW", + "lr": 1e-5, # Learning rate + "eps": 1e-8 +} +``` + +**What this does**: +- `name` - Optimizer type (AdamW is recommended for LLMs) +- `lr` - Learning rate (how fast the model learns) +- `eps` - Epsilon for numerical stability + +**How to modify**: +- **Lower learning rate** (e.g., `1e-6`) for fine-tuning +- **Higher learning rate** (e.g., `5e-5`) for pre-training (use with caution) +- Typical range for fine-tuning: `1e-6` to `1e-4` + +**Tips**: +- Start conservative with `1e-5` or `2e-5` +- If loss explodes, reduce learning rate +- If training is too slow, slightly increase learning rate + +--- + +### Step 5: Configure Learning Rate Scheduler + +```python +lr_scheduler_config = { + "warmup_steps": 200 # Number of warmup steps +} +``` + +**What this does**: +- `warmup_steps` - Number of steps to gradually increase learning rate from 0 to `lr` + +**Why warmup**: Prevents training instability at the beginning by starting with a low learning rate. 
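+
+To make the warmup idea concrete, here is an illustrative sketch of a linear warmup schedule (an assumption for intuition only; the scheduler actually used by the trainer may differ):
+
+```python
+def warmup_lr(step: int, base_lr: float = 1e-5, warmup_steps: int = 200) -> float:
+    # Ramp linearly from near zero up to base_lr over the warmup window, then hold.
+    return base_lr * min(1.0, (step + 1) / warmup_steps)
+
+print(warmup_lr(0))    # 5e-08  -- first step starts tiny
+print(warmup_lr(199))  # 1e-05  -- warmup complete, full learning rate
+```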
+ +**How to modify**: +- For short training (< 1000 steps): use 10-50 warmup steps +- For medium training (1000-5000 steps): use 100-200 warmup steps +- For long training (> 5000 steps): use 200-500 warmup steps +- Rule of thumb: ~5-10% of total training steps + +--- + +### Step 6: Configure Training Settings + +```python +training_config = { + "local_batch_size": 1, # Batch size per GPU + "seq_len": 2048, # Sequence length + "max_norm": 1.0, # Gradient clipping + "steps": 1000, # Total training steps + "compile": False, # PyTorch compilation + "dataset": "c4" # Dataset name +} +``` + +**What this does**: +- `local_batch_size` - Number of samples per GPU per step +- `seq_len` - Maximum sequence length (in tokens) +- `max_norm` - Gradient clipping threshold (prevents exploding gradients) +- `steps` - Total number of training steps +- `compile` - Enable PyTorch 2.0 compilation (experimental) +- `dataset` - Dataset identifier + +**How to modify**: + +**For Memory Issues**: +- Reduce `seq_len` (e.g., from 2048 to 1024) +- Reduce `local_batch_size` (e.g., from 2 to 1) +- Both reduce memory usage + +**For Faster Training**: +- Increase `local_batch_size` if you have memory +- Use shorter `seq_len` for tasks that don't need long context + +**For Quick Testing**: +- Set `steps` to 10-100 for quick validation + +**Global batch size** = `local_batch_size` ร— `procs` ร— `data_parallel_shard_degree` + +--- + +### Step 7: Configure Parallelism Settings + +```python +parallelism_config = { + "data_parallel_replicate_degree": 1, + "data_parallel_shard_degree": -1, # -1 = use all GPUs for FSDP + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "context_parallel_degree": 1, + "expert_parallel_degree": 1, + "disable_loss_parallel": False +} +``` + +**What this does**: + +- **Data Parallel Shard Degree (FSDP)**: Splits model parameters across GPUs + - `-1` means use all available GPUs + - `8` means split across 8 GPUs + - Most common strategy for fine-tuning + +- **Tensor Parallel Degree**: Splits individual layers across GPUs + - Use for very large models that don't fit on single GPU even with FSDP + - `1` means no tensor parallelism + +- **Pipeline Parallel Degree**: Splits model into sequential stages + - Use for extremely large models + - `1` means no pipeline parallelism + +- **Context Parallel Degree**: Splits sequence dimension + - For very long sequences + - `1` means no context parallelism + +**Common Configurations**: + +**Single GPU**: +```python +"data_parallel_shard_degree": 1 +``` + +**8 GPUs with FSDP (recommended)**: +```python +"data_parallel_shard_degree": -1 # or 8 +``` + +**Large Model (70B+) with Tensor Parallelism**: +```python +"data_parallel_shard_degree": 4, +"tensor_parallel_degree": 2 +``` + +--- + +### Step 8: Configure Checkpoint Settings + +```python +checkpoint_config = { + "enable": True, + "folder": "/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints", + "initial_load_path": "/tmp/Meta-Llama-3.1-8B-Instruct/", + "initial_load_in_hf": True, + "last_save_in_hf": True, + "interval": 500, # Save every N steps + "async_mode": "disabled" +} +``` + +**What this does**: +- `enable` - Whether to enable checkpointing +- `folder` - Where to save checkpoints +- `initial_load_path` - Where to load initial weights from +- `initial_load_in_hf` - Load weights in HuggingFace format +- `last_save_in_hf` - Save final checkpoint in HuggingFace format +- `interval` - How often to save (in steps) +- `async_mode` - Async saving mode (use "disabled" for simplicity) + +**How to 
modify**: +- **Save more frequently**: Reduce `interval` (e.g., 100) +- **Save less frequently**: Increase `interval` (e.g., 1000) +- **Resume training**: Point `initial_load_path` to your checkpoint folder + +**Important**: Make sure `folder` path exists and has enough disk space! + +--- + +### Step 9: Configure Activation Checkpointing + +```python +activation_checkpoint_config = { + "mode": "selective", + "selective_ac_option": "op" +} +``` + +**What this does**: +- Saves memory by recomputing activations during backward pass instead of storing them +- `mode` - Checkpointing mode ("selective" or "full") +- `selective_ac_option` - Which operations to checkpoint + +**Memory vs Speed Trade-off**: +- **Activation checkpointing ON**: Lower memory, slower training +- **Activation checkpointing OFF**: Higher memory, faster training + +**When to use**: Enable when running out of memory. + +--- + +### Step 10: Configure Communication Settings + +```python +comm_config = { + "trace_buf_size": 0 +} +``` + +**What this does**: +- Configuration for distributed communication (required by TorchTitan) +- Usually you don't need to modify this + +--- + +### Step 11: Combine All Configurations + +```python +complete_config = { + "comm": comm_config, + "model": model_config, + "processes": processes_config, + "optimizer": optimizer_config, + "lr_scheduler": lr_scheduler_config, + "training": training_config, + "parallelism": parallelism_config, + "checkpoint": checkpoint_config, + "activation_checkpoint": activation_checkpoint_config +} + +cfg = OmegaConf.create(complete_config) +``` + +**What this does**: +- Combines all configuration sections into one complete config +- Converts to OmegaConf format (allows dot notation access) + +**Prints**: The complete configuration in YAML format for review + +--- + +### Step 12: Run Training (Simple Way) + +```python +await run_actor(TrainerActor, cfg) +``` + +**What this does**: +- Spawns the trainer actor +- Runs setup (loads data, model, checkpoints) +- Runs training loop +- Cleans up resources +- All in one line! + +**When to use**: When you want fully automatic training with no manual intervention. 
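+
+**Running outside a notebook**: the `await` form above relies on Jupyter's running event loop. From a plain Python script, a minimal sketch (assuming you run from the repo root and either load one of the provided YAML files or build the dict from Steps 2-11) looks like this:
+
+```python
+import asyncio
+
+from omegaconf import OmegaConf
+
+from forge.apps.sft_v2.spawn_actor import run_actor
+from forge.apps.sft_v2.trainer_actor import TrainerActor
+
+# Load an existing config, or pass OmegaConf.create(complete_config) instead.
+cfg = OmegaConf.load("apps/sft_v2/llama3_8b.yaml")
+asyncio.run(run_actor(TrainerActor, cfg))
+```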
+ +--- + +### Alternative: Manual Lifecycle Control + +For more control over the training process: + +#### Create and Spawn the Actor + +```python +spawner = SpawnActor(TrainerActor, cfg) +actor = await spawner.spawn() +``` + +**What this does**: +- Creates a spawner with your config +- Spawns the actor instance (allocates resources, initializes distributed environment) + +#### Setup the Actor + +```python +await spawner.setup() +``` + +**What this does**: +- Loads tokenizer from `hf_assets_path` +- Loads training dataset +- Initializes model +- Loads checkpoint if specified + +**At this point**: You could inspect the actor state before training: +```python +print(f"Current step: {actor.current_step}") +print(f"Device: {actor.device}") +``` + +#### Run Training + +```python +await spawner.run() +``` + +**What this does**: +- Executes the training loop +- Iterates through batches +- Performs forward/backward passes +- Updates weights +- Saves checkpoints at intervals + +#### Cleanup + +```python +await spawner.cleanup() +``` + +**What this does**: +- Closes checkpointer +- Closes logger +- Stops the actor mesh +- Frees resources + +**When to use manual control**: +- When you want to inspect state between phases +- When you want to modify configuration between setup and run +- For debugging purposes + +--- + +## Utility Functions Explained + +The `utils.py` module provides reusable helper functions: + +### 1. `setup_tokenizer()` + +```python +def setup_tokenizer( + hf_assets_path: str, + tokenizer_filename: str = "tokenizer.json", + tokenizer_config_filename: str = "tokenizer_config.json", + generation_config_filename: str = "generation_config.json", +) -> HuggingFaceModelTokenizer +``` + +**What it does**: +- Loads a HuggingFace tokenizer from the model assets directory +- Initializes tokenizer with config and generation settings + +**Parameters**: +- `hf_assets_path` - Path to directory containing tokenizer files +- Other parameters are filenames (usually don't need to change) + +**Returns**: Initialized `HuggingFaceModelTokenizer` object + +**Example**: +```python +tokenizer = setup_tokenizer("/tmp/Meta-Llama-3.1-8B-Instruct") +``` + +**When to use**: If you need to use the tokenizer independently (e.g., for preprocessing data) + +--- + +### 2. 
`setup_sft_dataloader()` + +```python +def setup_sft_dataloader( + tokenizer: HuggingFaceModelTokenizer, + dataset_path: str, + dataset_split: str, + target_tokens_per_pack: int, + batch_size: int, + device: torch.device, + padding_idx: int = 0, + message_transform: Optional[Any] = None, +) -> StatefulDataLoader +``` + +**What it does**: +- Creates a dataloader for supervised fine-tuning +- Handles data loading, tokenization, and packing +- Returns a StatefulDataLoader (can save/restore state for checkpointing) + +**Parameters**: +- `tokenizer` - Tokenizer to use for text processing +- `dataset_path` - HuggingFace dataset name (e.g., "yahma/alpaca-cleaned") +- `dataset_split` - Which split to use ("train", "validation", "test") +- `target_tokens_per_pack` - Sequence length (same as `seq_len` in config) +- `batch_size` - Batch size (same as `local_batch_size` in config) +- `device` - Which device to move tensors to +- `padding_idx` - Token ID for padding (usually 0) +- `message_transform` - Transform to convert dataset format (default: AlpacaToMessages) + +**Returns**: Configured `StatefulDataLoader` + +**Example**: +```python +dataloader = setup_sft_dataloader( + tokenizer=tokenizer, + dataset_path="yahma/alpaca-cleaned", + dataset_split="train", + target_tokens_per_pack=2048, + batch_size=4, + device=torch.device("cuda"), +) +``` + +**When to use**: If you want to create a custom dataloader outside of TrainerActor + +--- + +### 3. `create_context_parallel_context()` + +```python +def create_context_parallel_context( + parallel_dims: ParallelDims, + inputs: torch.Tensor, + labels: torch.Tensor, + model_parts: list, + rotate_method: str, +) +``` + +**What it does**: +- Creates context for context parallelism (splits sequence across GPUs) +- Returns None if context parallelism is disabled + +**Parameters**: +- `parallel_dims` - Parallel dimensions configuration +- `inputs` - Input tensor +- `labels` - Label tensor +- `model_parts` - List of model parts +- `rotate_method` - Rotation method for context parallel + +**Returns**: Context parallel context or None + +**When to use**: Internally used by TrainerActor. You rarely need to call this directly. + +--- + +### 4. `move_batch_to_device()` + +```python +def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any] +``` + +**What it does**: +- Moves all tensors in a batch dictionary to the specified device +- Leaves non-tensor values unchanged + +**Parameters**: +- `batch` - Dictionary containing batch data +- `device` - Target device (e.g., `torch.device("cuda")`) + +**Returns**: Batch with tensors moved to device + +**Example**: +```python +batch = {"tokens": tensor, "labels": tensor, "metadata": "some_string"} +batch = move_batch_to_device(batch, torch.device("cuda")) +``` + +**When to use**: Useful when manually processing batches + +--- + +### 5. `log_training_step()` + +```python +def log_training_step( + step: int, + total_steps: int, + loss: torch.Tensor, + logger: logging.Logger, +) +``` + +**What it does**: +- Logs training progress in a formatted way +- Shows current step, total steps, and loss value + +**Parameters**: +- `step` - Current training step +- `total_steps` - Total number of training steps +- `loss` - Current loss tensor +- `logger` - Logger instance + +**Example output**: +``` +Step 100/1000 | Loss: 2.3456 +``` + +**When to use**: Internally used by TrainerActor. You can use it for custom logging. + +--- + +## How to Run + +### Prerequisites + +1. 
**Download Model**:
```bash
export HF_HUB_DISABLE_XET=1
forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct
```

2. **Check GPU Availability**:
```bash
nvidia-smi # Should show your GPUs
```

### Running the Notebook

#### Option 1: Using Jupyter Notebook

1. **Start Jupyter**:
```bash
cd /home/hosseinkh/TorchForge/forge
jupyter notebook
```

2. **Open the notebook**:
   - Navigate to `apps/sft_v2/interactive_config_notebook.ipynb`
   - Click to open

3. **Run cells sequentially**:
   - Click on the first cell, press `Shift + Enter` to run
   - Continue through all cells
   - Modify configuration cells as needed
   - Run Step 12 to start training

#### Option 2: Using VS Code

1. **Open notebook in VS Code**:
   - File → Open → `interactive_config_notebook.ipynb`

2. **Select Python kernel**:
   - Click "Select Kernel" in the top right
   - Choose your Python environment

3. **Run cells**:
   - Click the "Run Cell" button on each cell
   - Or press `Shift + Enter`

#### Option 3: Using Command Line (with simplified entry point)

```bash
cd /home/hosseinkh/TorchForge/forge
python -m apps.sft_v2.notebook_main --config apps/sft_v2/llama3_8b.yaml
```

Note: this entry point reads a YAML file; use the notebook if you want fully interactive configuration.

---

## Common Scenarios

### Scenario 1: Quick Test (1 GPU, 100 steps)

```python
# Modify these cells:
processes_config = {"procs": 1, "with_gpus": True}
training_config = {
    "local_batch_size": 1,
    "seq_len": 1024,
    "steps": 100,  # Just 100 steps
    ...
}
```

**Expected time**: 5-10 minutes on an A100

### Scenario 2: Full Training (8 GPUs, 5000 steps)

```python
processes_config = {"procs": 8, "with_gpus": True}
training_config = {
    "local_batch_size": 2,
    "seq_len": 2048,
    "steps": 5000,
    ...
}
parallelism_config = {
    "data_parallel_shard_degree": -1,  # Use all 8 GPUs
    ...
}
```

**Expected time**: Several hours depending on hardware

### Scenario 3: Memory-Constrained Training

```python
training_config = {
    "local_batch_size": 1,  # Small batch
    "seq_len": 1024,  # Shorter sequence
    ...
}
activation_checkpoint_config = {
    "mode": "selective",  # Enable AC for memory savings
    ...
}
```

**Use when**: Running out of GPU memory

### Scenario 4: Resume from Checkpoint

```python
checkpoint_config = {
    "enable": True,
    "folder": "/path/to/previous/checkpoints",
    "initial_load_path": "/path/to/previous/checkpoints/step_1000",
    "interval": 500,
    ...
}
```

**Use when**: Continuing training from a saved checkpoint

---

## Troubleshooting

### Problem: "CUDA out of memory"

**Solutions**:
1. Reduce `seq_len` (e.g., from 2048 to 1024)
2. Reduce `local_batch_size` (e.g., from 2 to 1)
3. Enable activation checkpointing
4. Use more GPUs with FSDP

### Problem: "Loss is NaN or exploding"

**Solutions**:
1. Reduce the learning rate (e.g., from `1e-5` to `1e-6`)
2. Tighten gradient clipping by lowering `max_norm` (e.g., from 1.0 to 0.5)
3. Increase warmup steps

### Problem: "Training is too slow"

**Solutions**:
1. Increase `local_batch_size` if memory allows
2. Use more GPUs
3. Reduce `seq_len` if your task doesn't need long context
4. Enable compilation (`compile: True`)

### Problem: "Cannot find tokenizer files"

**Solutions**:
1. Check that `hf_assets_path` is correct
2. Ensure the path contains `tokenizer.json` and `tokenizer_config.json`
3. 
Re-download model if files are missing + +### Problem: "Actor spawning fails" + +**Solutions**: +1. Check you have enough GPUs for `procs` +2. Verify CUDA is available (`torch.cuda.is_available()`) +3. Check no other processes are using GPUs + +--- + +## Summary + +**Key Takeaways**: + +1. **Interactive Configuration**: Define all settings in notebook cells, no YAML needed +2. **Step-by-Step**: Configure model, processes, optimizer, training, parallelism, checkpoints separately +3. **Two Ways to Run**: Simple (`run_actor()`) or manual (lifecycle control) +4. **Utility Functions**: Helper functions for tokenization, data loading, device management +5. **Templates Provided**: Quick test, multi-GPU, memory-efficient configs ready to use +6. **Flexible**: Easy to modify parameters and experiment + +**Next Steps**: +1. Download your model +2. Open the notebook +3. Modify configuration cells for your needs +4. Run Step 12 to start training +5. Monitor logs for progress + +Happy Training! ๐Ÿš€ diff --git a/apps/sft_v2/README_NOTEBOOK.md b/apps/sft_v2/README_NOTEBOOK.md deleted file mode 100644 index eb70a29ea..000000000 --- a/apps/sft_v2/README_NOTEBOOK.md +++ /dev/null @@ -1,435 +0,0 @@ -# ๐Ÿš€ SFT Training Notebook Guide - -This directory contains an interactive Jupyter notebook experience for training Language Models with Supervised Fine-Tuning (SFT). - -## ๐Ÿ“ Files - -### Core Files -- **`sft_training_notebook.ipynb`** - Main Jupyter notebook for interactive training -- **`notebook_utils.py`** - Utility functions for notebook-based training -- **`main.py`** - Original command-line training script (unchanged) - -### Configuration Files -- **`llama3_8b.yaml`** - Original single-node config -- **`llama3_8b_single_node.yaml`** - Single-node config without provisioner -- **`llama3_8b_slurm_multinode.yaml`** - Multi-node config with SLURM -- **`llama3_8b_local.yaml`** - Local testing config - -## ๐ŸŽฏ Quick Start - -### 1. Open the Notebook - -```bash -cd /home/hosseinkh/forge -jupyter notebook apps/sft_v2/sft_training_notebook.ipynb -``` - -Or in VS Code: -- Open `apps/sft_v2/sft_training_notebook.ipynb` -- Select Python kernel -- Run cells sequentially - -### 2. Configure Training - -The notebook is organized into sections: - -1. **๐Ÿ“ฆ Model Configuration** - Choose model and path -2. **โš™๏ธ Training Configuration** - Set hyperparameters -3. **๐Ÿ”ง Optimizer Configuration** - Configure optimizer and LR scheduler -4. **๐Ÿ”€ Parallelism Configuration** - Set distributed training strategy -5. **๐Ÿ’พ Checkpoint Configuration** - Configure checkpointing -6. **๐Ÿ–ฅ๏ธ Resource Configuration** - Set number of GPUs/nodes -7. **โ˜๏ธ Provisioner Configuration** (optional) - For multi-node SLURM - -### 3. Run Training - -Execute the "Run Training!" cell to start training with your configuration. - -## ๐Ÿ“š Using the Utility Library - -The `notebook_utils.py` module provides a clean API for training: - -### Configuration Builders - -```python -from apps.sft_v2 import notebook_utils as nb - -# Create model config -model_config = nb.create_model_config( - name="llama3", - flavor="8B", - hf_assets_path="/path/to/model" -) - -# Create training config -training_config = nb.create_training_config( - steps=1000, - local_batch_size=1, - seq_len=2048 -) - -# Create optimizer config -optimizer_config = nb.create_optimizer_config( - name="AdamW", - lr=1e-5 -) - -# ... 
configure other components - -# Build complete config -config = nb.build_config( - model_config=model_config, - training_config=training_config, - optimizer_config=optimizer_config, - # ... other configs -) -``` - -### Training Functions - -```python -# Simple: run everything -nb.train(config) - -# Advanced: step-by-step control -import asyncio - -async def custom_training(): - # Initialize - await nb.initialize_provisioner(config) - - # Create and setup - recipe = await nb.create_recipe(config) - await nb.setup_recipe(recipe) - - # Train - await nb.train_recipe(recipe) - - # Cleanup - await nb.cleanup_recipe(recipe) - await nb.shutdown_provisioner(config) - -asyncio.run(custom_training()) -``` - -### Display Utilities - -```python -# Print summary -nb.summarize_config(config) - -# Print full YAML -nb.print_config(config, title="My Config") -``` - -## ๐Ÿ”ง Configuration Functions Reference - -### Model Configuration - -```python -nb.create_model_config( - name: str = "llama3", - flavor: str = "8B", - hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct" -) -``` - -### Training Configuration - -```python -nb.create_training_config( - local_batch_size: int = 1, - seq_len: int = 2048, - max_norm: float = 1.0, - steps: int = 1000, - dataset: str = "c4", - compile: bool = False -) -``` - -### Optimizer Configuration - -```python -nb.create_optimizer_config( - name: str = "AdamW", - lr: float = 1e-5, - eps: float = 1e-8, - weight_decay: float = 0.0, - betas: tuple = (0.9, 0.999) -) -``` - -### LR Scheduler Configuration - -```python -nb.create_lr_scheduler_config( - warmup_steps: int = 200, - decay_steps: Optional[int] = None, - min_lr: float = 0.0 -) -``` - -### Parallelism Configuration - -```python -nb.create_parallelism_config( - data_parallel_replicate_degree: int = 1, - data_parallel_shard_degree: int = -1, # -1 = auto (FSDP) - tensor_parallel_degree: int = 1, - pipeline_parallel_degree: int = 1, - context_parallel_degree: int = 1, - expert_parallel_degree: int = 1, - disable_loss_parallel: bool = False -) -``` - -### Checkpoint Configuration - -```python -nb.create_checkpoint_config( - enable: bool = True, - folder: str = "/tmp/checkpoints", - initial_load_path: Optional[str] = None, - initial_load_in_hf: bool = True, - last_save_in_hf: bool = True, - interval: int = 500, - async_mode: str = "disabled" -) -``` - -### Activation Checkpoint Configuration - -```python -nb.create_activation_checkpoint_config( - mode: str = "selective", # 'selective', 'full', 'none' - selective_ac_option: str = "op" -) -``` - -### Process Configuration - -```python -# Single node -nb.create_process_config( - procs: int = 8, - with_gpus: bool = True, - hosts: Optional[int] = None -) - -# Multi-node -nb.create_process_config( - procs: int = 8, - with_gpus: bool = True, - hosts: int = 4 # 4 nodes -) -``` - -### Provisioner Configuration (Multi-Node Only) - -```python -nb.create_provisioner_config( - launcher: str = "slurm", - job_name: str = "sft_training", - partition: Optional[str] = None, - time: Optional[str] = None, - account: Optional[str] = None -) -``` - -## ๐Ÿ“– Example Configurations - -### Quick Test (Single GPU, 10 steps) - -```python -model_config = nb.create_model_config( - name="llama3", - flavor="8B", - hf_assets_path="/path/to/model" -) - -training_config = nb.create_training_config( - steps=10, - local_batch_size=1 -) - -process_config = nb.create_process_config(procs=1) - -# ... 
configure other components with defaults -``` - -### Single Node, 8 GPUs, FSDP - -```python -parallelism_config = nb.create_parallelism_config( - data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP -) - -process_config = nb.create_process_config(procs=8) - -# No provisioner needed -provisioner_config = None -``` - -### Multi-Node, 4ร—8 GPUs, Tensor Parallel - -```python -parallelism_config = nb.create_parallelism_config( - data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP - tensor_parallel_degree=2 -) - -process_config = nb.create_process_config( - procs=8, - hosts=4 -) - -provisioner_config = nb.create_provisioner_config( - launcher="slurm", - job_name="sft_multinode", - partition="gpu_partition", - time="24:00:00" -) -``` - -## ๐ŸŽ“ Advanced Usage - -### Custom Training Loop - -You can modify the training loop by creating your own recipe class: - -```python -from apps.sft_v2.main import ForgeSFTRecipe - -class CustomRecipe(ForgeSFTRecipe): - async def train(self): - # Custom training logic - dataloader = iter(self.train_dataloader) - - for step in range(self.num_training_steps): - batch = next(dataloader) - # Custom batch processing - self.train_step(batch) -``` - -### Experiment Tracking - -Integrate with your favorite tracking tool: - -```python -import wandb - -# Initialize tracking -wandb.init(project="sft-training", config=config) - -# Train -nb.train(config) - -# Log results -wandb.log({"final_step": config.training.steps}) -``` - -### Config Variations - -Generate multiple configs for hyperparameter sweeps: - -```python -learning_rates = [1e-5, 5e-5, 1e-4] -configs = [] - -for lr in learning_rates: - optimizer_config = nb.create_optimizer_config(lr=lr) - config = nb.build_config( - # ... other configs - optimizer_config=optimizer_config - ) - configs.append(config) - -# Train all configs -for config in configs: - nb.train(config) -``` - -## ๐Ÿ” Debugging Tips - -### Start Simple - -1. **Use 1 GPU first**: - ```python - process_config = nb.create_process_config(procs=1) - ``` - -2. **Run few steps**: - ```python - training_config = nb.create_training_config(steps=10) - ``` - -3. **Disable compilation**: - ```python - training_config = nb.create_training_config(compile=False) - ``` - -### Common Issues - -**Memory Errors:** -- Reduce batch size or sequence length -- Enable FSDP: `data_parallel_shard_degree=-1` -- Enable activation checkpointing: `mode="selective"` or `"full"` - -**Slow Training:** -- Increase batch size if memory allows -- Enable compilation: `compile=True` -- Use tensor parallelism for large models - -**Actor Timeout Errors:** -- Make sure you're not using provisioner config on single node -- Check SLURM availability with `sinfo` -- See `TROUBLESHOOTING_MULTINODE.md` for details - -## ๐Ÿ“ฆ Saving and Loading Configs - -### Save Config - -```python -from omegaconf import OmegaConf - -config_path = "my_config.yaml" -with open(config_path, 'w') as f: - OmegaConf.save(config, f) -``` - -### Load Config - -```python -from omegaconf import OmegaConf - -config = OmegaConf.load("my_config.yaml") -nb.train(config) -``` - -## ๐Ÿš€ Next Steps - -1. **Start with the notebook**: Open `sft_training_notebook.ipynb` and follow along -2. **Try a test run**: Configure for 10 steps with 1 GPU -3. **Scale up**: Increase to 8 GPUs with FSDP -4. 
**Go multi-node**: Configure SLURM provisioner for cluster training - -## ๐Ÿ“š Additional Resources - -- **`MULTINODE_SFT_V2_GUIDE.md`** - Detailed guide on multi-node training -- **`TROUBLESHOOTING_MULTINODE.md`** - Troubleshooting guide for multi-node issues -- **`main.py`** - Original implementation for reference - -## ๐Ÿค Contributing - -To add new configuration options: - -1. Add a `create_*_config()` function in `notebook_utils.py` -2. Update `build_config()` to include the new config -3. Add a new cell in the notebook to configure it -4. Update this README - -## โš–๏ธ License - -Copyright (c) Meta Platforms, Inc. and affiliates. - -Licensed under the BSD-style license found in the LICENSE file. diff --git a/apps/sft_v2/actor.py b/apps/sft_v2/actor.py new file mode 100644 index 000000000..8607a39c4 --- /dev/null +++ b/apps/sft_v2/actor.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Abstract Actor class for training/inference actors in Forge. + +This provides a base class that can be extended for different types of actors +(e.g., Trainer, Evaluator, Inferencer, etc.) +""" + +import logging +import math +import os +from abc import ABC, abstractmethod +from typing import Any, Optional + +import torch +from forge.controller import ForgeActor +from monarch.actor import current_rank, current_size +from omegaconf import DictConfig, OmegaConf +from torch import nn +from torchtitan.components.loss import LossFunction +from torchtitan.components.lr_scheduler import LRSchedulersContainer +from torchtitan.components.optimizer import OptimizersContainer +from torchtitan.distributed import ParallelDims +from torchtitan.experiments.forge.engine import ForgeEngine +from torchtitan.experiments.forge.job_config import ForgeJobConfig + +Checkpointer = Any +Dataloader = Any +MetricLogger = Any +Profiler = Any +Tokenizer = Any + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class BaseForgeActor(ForgeActor, ForgeEngine, ABC): + """ + Abstract base class for Forge actors. + + This class handles common initialization, distributed setup, and provides + abstract methods that must be implemented by concrete actor classes. + """ + + job_config: ForgeJobConfig + parallel_dims: ParallelDims + model: list[nn.Module] + loss_fn: Optional[LossFunction] + optimizer: Optional[OptimizersContainer] + lr_scheduler: Optional[LRSchedulersContainer] + checkpointer: Optional[Checkpointer] + tokenizer: Optional[Tokenizer] + metric_logger: Optional[MetricLogger] + profiler: Optional[Profiler] + device: torch.device + + def __init__(self, config: DictConfig): + """ + Initialize the base actor with configuration. + + Args: + config: Configuration dictionary containing job settings + """ + job_config = ForgeJobConfig().to_dict() + job_config = OmegaConf.merge(job_config, config) + + self.current_step = 0 + self.metric_logger = None + self.gradient_accumulation_steps = 1 + self._rank = current_rank().rank + self._size = math.prod(current_size().values()) + + self._init_dist() + super().__init__(job_config) + + def _init_dist(self): + """ + Initialize torch distributed environment. + + Sets up environment variables required for distributed training + in the Monarch actor framework. 
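+
+        These mirror the standard torch.distributed / torchrun environment
+        variables (RANK, WORLD_SIZE, LOCAL_RANK, ...), derived from the
+        Monarch rank and size of this actor and exported before the engine
+        superclass is initialized. Illustrative values for rank 3 of an
+        8-process, single-host mesh: RANK=3, LOCAL_RANK=3, WORLD_SIZE=8,
+        LOCAL_WORLD_SIZE=8.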
+ """ + env = { + "RANK": str(self._rank), + "LOCAL_RANK": str(self._rank), + "LOCAL_WORLD_SIZE": str(self._size), + "GROUP_RANK": str(self._size), + "GROUP_WORLD_SIZE": str(self._size), + "ROLE_RANK": str(self._rank), + "ROLE_WORLD_SIZE": str(self._size), + "ROLE_NAME": "rank", + "WORLD_SIZE": str(self._size), + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + } + os.environ.update(env) + logger.info(f"Initialized distributed environment: {env}") + + @abstractmethod + async def setup(self): + """ + Setup the actor (load data, checkpoint, etc.). + + This method must be implemented by concrete actor classes. + """ + pass + + @abstractmethod + async def run(self): + """ + Main execution logic for the actor. + + This method must be implemented by concrete actor classes. + """ + pass + + @abstractmethod + async def cleanup(self): + """ + Cleanup resources (close checkpointer, logger, etc.). + + This method must be implemented by concrete actor classes. + """ + pass + + @abstractmethod + def __repr__(self) -> str: + """String representation of the actor.""" + pass diff --git a/apps/sft_v2/interactive_config_notebook.ipynb b/apps/sft_v2/interactive_config_notebook.ipynb new file mode 100644 index 000000000..624f6a08a --- /dev/null +++ b/apps/sft_v2/interactive_config_notebook.ipynb @@ -0,0 +1,629 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SFT Training - Interactive Configuration Notebook\n", + "\n", + "This notebook allows you to configure and run SFT training **without any YAML files**!\n", + "\n", + "## Benefits\n", + "\n", + "โœ… No external YAML files needed \n", + "โœ… Interactive configuration in separate cells \n", + "โœ… Easy to modify and experiment \n", + "โœ… All configuration visible in notebook \n", + "โœ… Quick templates for common scenarios" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import logging\n", + "from omegaconf import OmegaConf, DictConfig\n", + "\n", + "from forge.apps.sft_v2.trainer_actor import TrainerActor\n", + "from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor\n", + "\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Model Settings\n", + "\n", + "Define your model configuration. **Modify these values as needed!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_config = {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + "}\n", + "\n", + "print(\"Model Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(model_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Configure Process Settings\n", + "\n", + "Define how many processes to use and whether to use GPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processes_config = {\n", + " \"procs\": 8, # Number of processes\n", + " \"with_gpus\": True # Use GPUs\n", + "}\n", + "\n", + "print(\"Process Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(processes_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Configure Optimizer Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer_config = {\n", + " \"name\": \"AdamW\",\n", + " \"lr\": 1e-5, # Learning rate\n", + " \"eps\": 1e-8\n", + "}\n", + "\n", + "print(\"Optimizer Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(optimizer_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Configure Learning Rate Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lr_scheduler_config = {\n", + " \"warmup_steps\": 200 # Number of warmup steps\n", + "}\n", + "\n", + "print(\"LR Scheduler Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(lr_scheduler_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Configure Training Settings\n", + "\n", + "**Key parameters to adjust for your experiment:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_config = {\n", + " \"local_batch_size\": 1, # Batch size per GPU\n", + " \"seq_len\": 2048, # Sequence length\n", + " \"max_norm\": 1.0, # Gradient clipping\n", + " \"steps\": 1000, # Total training steps\n", + " \"compile\": False, # PyTorch compilation\n", + " \"dataset\": \"c4\" # Dataset name\n", + "}\n", + "\n", + "print(\"Training Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(training_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Configure Parallelism Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "parallelism_config = {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": -1, # -1 means use all available GPUs for FSDP\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + "}\n", + "\n", + "print(\"Parallelism Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(parallelism_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: Configure Checkpoint Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint_config = {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 500, # Save every N steps\n", + " \"async_mode\": \"disabled\"\n", + "}\n", + "\n", + "print(\"Checkpoint Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(checkpoint_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 9: Configure Activation Checkpointing" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activation_checkpoint_config = {\n", + " \"mode\": \"selective\",\n", + " \"selective_ac_option\": \"op\"\n", + "}\n", + "\n", + "print(\"Activation Checkpoint Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(activation_checkpoint_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 10: Configure Communication Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comm_config = {\n", + " \"trace_buf_size\": 0\n", + "}\n", + "\n", + "print(\"Communication Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(comm_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 11: Combine All Configurations\n", + "\n", + "Now let's merge everything into a complete configuration!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Combine all configs\n", + "complete_config = {\n", + " \"comm\": comm_config,\n", + " \"model\": model_config,\n", + " \"processes\": processes_config,\n", + " \"optimizer\": optimizer_config,\n", + " \"lr_scheduler\": lr_scheduler_config,\n", + " \"training\": training_config,\n", + " \"parallelism\": parallelism_config,\n", + " \"checkpoint\": checkpoint_config,\n", + " \"activation_checkpoint\": activation_checkpoint_config\n", + "}\n", + "\n", + "# Create OmegaConf DictConfig\n", + "cfg = OmegaConf.create(complete_config)\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"COMPLETE CONFIGURATION\")\n", + "print(\"=\" * 80)\n", + "print(OmegaConf.to_yaml(cfg))\n", + "print(\"=\" * 80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 12: Run Training (Simple Way)\n", + "\n", + "The simplest way - automatic lifecycle management!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training with automatic lifecycle management\n", + "await run_actor(TrainerActor, cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Alternative: Manual Lifecycle Control\n", + "\n", + "For more control, manage each phase separately.\n", + "\n", + "### Create and Spawn the Actor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the spawner\n", + "spawner = SpawnActor(TrainerActor, cfg)\n", + "\n", + "# Spawn the actor\n", + "actor = await spawner.spawn()\n", + "print(f\"โœ“ Actor spawned: {actor}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup the Actor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup (load data, checkpoints, etc.)\n", + "await spawner.setup()\n", + "print(\"โœ“ Actor setup complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training\n", + "await spawner.run()\n", + "print(\"โœ“ Training complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Cleanup resources\n", + "await spawner.cleanup()\n", + "print(\"โœ“ Cleanup complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Quick Configuration Templates\n", + "\n", + "Here are ready-to-use templates for common scenarios!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Template 1: Quick Test (Single GPU, Small Steps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quick_test_config = OmegaConf.create({\n", + " \"comm\": {\"trace_buf_size\": 0},\n", + " \"model\": {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + " },\n", + " \"processes\": {\"procs\": 1, \"with_gpus\": True},\n", + " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", + " \"lr_scheduler\": {\"warmup_steps\": 10},\n", + " \"training\": {\n", + " \"local_batch_size\": 1,\n", + " \"seq_len\": 1024,\n", + " \"max_norm\": 1.0,\n", + " \"steps\": 100, # Just 100 steps for quick testing\n", + " \"compile\": False,\n", + " \"dataset\": \"c4\"\n", + " },\n", + " \"parallelism\": {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": 1,\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + " },\n", + " \"checkpoint\": {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/quick_test_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 50,\n", + " \"async_mode\": \"disabled\"\n", + " },\n", + " \"activation_checkpoint\": {\n", + " \"mode\": \"selective\",\n", + " \"selective_ac_option\": \"op\"\n", + " }\n", + "})\n", + "\n", + "print(\"Quick Test Configuration:\")\n", + "print(OmegaConf.to_yaml(quick_test_config))\n", + "\n", + "# To use: await run_actor(TrainerActor, quick_test_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Template 2: Multi-GPU Training (8 GPUs with FSDP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "multi_gpu_config = OmegaConf.create({\n", + " \"comm\": {\"trace_buf_size\": 0},\n", + " \"model\": {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + " },\n", + " \"processes\": {\"procs\": 8, \"with_gpus\": True},\n", + " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 2e-5, \"eps\": 1e-8},\n", + " \"lr_scheduler\": {\"warmup_steps\": 200},\n", + " \"training\": {\n", + " \"local_batch_size\": 2,\n", + " \"seq_len\": 2048,\n", + " \"max_norm\": 1.0,\n", + " \"steps\": 5000,\n", + " \"compile\": False,\n", + " \"dataset\": \"c4\"\n", + " },\n", + " \"parallelism\": {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": 8, # FSDP across 8 GPUs\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + " },\n", + " \"checkpoint\": {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/multi_gpu_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 500,\n", + " \"async_mode\": \"disabled\"\n", + " },\n", + " \"activation_checkpoint\": {\n", + " \"mode\": \"selective\",\n", + " \"selective_ac_option\": \"op\"\n", + " }\n", + "})\n", + "\n", + 
"print(\"Multi-GPU Configuration:\")\n", + "print(OmegaConf.to_yaml(multi_gpu_config))\n", + "\n", + "# To use: await run_actor(TrainerActor, multi_gpu_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Template 3: Memory-Efficient Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "memory_efficient_config = OmegaConf.create({\n", + " \"comm\": {\"trace_buf_size\": 0},\n", + " \"model\": {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + " },\n", + " \"processes\": {\"procs\": 4, \"with_gpus\": True},\n", + " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", + " \"lr_scheduler\": {\"warmup_steps\": 150},\n", + " \"training\": {\n", + " \"local_batch_size\": 1, # Small batch size\n", + " \"seq_len\": 1024, # Shorter sequence\n", + " \"max_norm\": 1.0,\n", + " \"steps\": 2000,\n", + " \"compile\": False,\n", + " \"dataset\": \"c4\"\n", + " },\n", + " \"parallelism\": {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": 4,\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + " },\n", + " \"checkpoint\": {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/memory_efficient_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 400,\n", + " \"async_mode\": \"disabled\"\n", + " },\n", + " \"activation_checkpoint\": {\n", + " \"mode\": \"selective\", # Saves memory\n", + " \"selective_ac_option\": \"op\"\n", + " }\n", + "})\n", + "\n", + "print(\"Memory-Efficient Configuration:\")\n", + "print(OmegaConf.to_yaml(memory_efficient_config))\n", + "\n", + "# To use: await run_actor(TrainerActor, memory_efficient_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Tips & Tricks\n", + "\n", + "## Memory Optimization\n", + "- โฌ‡๏ธ Reduce `seq_len` if running out of memory\n", + "- โฌ‡๏ธ Reduce `local_batch_size` if running out of memory\n", + "- โœ… Enable `activation_checkpoint` for memory savings\n", + "\n", + "## Training Speed\n", + "- โฌ†๏ธ Increase `local_batch_size` for faster training (if memory allows)\n", + "- ๐Ÿš€ Use multiple GPUs with FSDP (`data_parallel_shard_degree > 1`)\n", + "- โšก Enable `compile: true` for PyTorch compilation (experimental)\n", + "\n", + "## Debugging\n", + "- ๐Ÿงช Start with small `steps` (e.g., 10-100) to test quickly\n", + "- ๐Ÿ” Use single GPU first (`procs: 1`)\n", + "- ๐Ÿ“Š Monitor loss values in logs\n", + "\n", + "## Checkpoint Management\n", + "- ๐Ÿ’พ Set `interval` based on how often you want to save\n", + "- ๐Ÿ“ Ensure `folder` path exists and has enough space\n", + "- ๐Ÿ”„ Use `initial_load_path` to resume from checkpoints" + ] + } + ], + "metadata": { + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/apps/sft_v2/notebook_utils.py b/apps/sft_v2/notebook_utils.py deleted file mode 100644 index b3636fd26..000000000 --- a/apps/sft_v2/notebook_utils.py +++ /dev/null @@ -1,463 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Utility functions for notebook-based SFT training. -This module provides a clean API for interactive training in Jupyter notebooks. -""" - -import asyncio -import logging -from typing import Any, Dict, Optional - -import torch - -from apps.sft_v2.main import ForgeSFTRecipe -from omegaconf import DictConfig, OmegaConf - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# ============================================================================ -# Configuration Builders -# ============================================================================ - - -def create_model_config( - name: str = "llama3", - flavor: str = "8B", - hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct", -) -> Dict[str, Any]: - """ - Create model configuration. - - Args: - name: Model architecture name (e.g., 'llama3', 'llama2') - flavor: Model size (e.g., '8B', '70B') - hf_assets_path: Path to HuggingFace model assets - - Returns: - Dictionary with model configuration - """ - return { - "name": name, - "flavor": flavor, - "hf_assets_path": hf_assets_path, - } - - -def create_optimizer_config( - name: str = "AdamW", - lr: float = 1e-5, - eps: float = 1e-8, - weight_decay: float = 0.0, - betas: tuple = (0.9, 0.999), -) -> Dict[str, Any]: - """ - Create optimizer configuration. - - Args: - name: Optimizer name (e.g., 'AdamW', 'Adam', 'SGD') - lr: Learning rate - eps: Epsilon for numerical stability - weight_decay: L2 regularization coefficient - betas: Coefficients for computing running averages - - Returns: - Dictionary with optimizer configuration - """ - return { - "name": name, - "lr": lr, - "eps": eps, - "weight_decay": weight_decay, - "betas": list(betas), - } - - -def create_lr_scheduler_config( - warmup_steps: int = 200, - decay_steps: Optional[int] = None, - min_lr: float = 0.0, -) -> Dict[str, Any]: - """ - Create learning rate scheduler configuration. - - Args: - warmup_steps: Number of warmup steps - decay_steps: Number of decay steps (None = no decay) - min_lr: Minimum learning rate - - Returns: - Dictionary with LR scheduler configuration - """ - config = {"warmup_steps": warmup_steps} - if decay_steps is not None: - config["decay_steps"] = decay_steps - if min_lr > 0: - config["min_lr"] = min_lr - return config - - -def create_training_config( - local_batch_size: int = 1, - seq_len: int = 2048, - max_norm: float = 1.0, - steps: int = 1000, - dataset: str = "c4", - compile: bool = False, -) -> Dict[str, Any]: - """ - Create training configuration. - - Args: - local_batch_size: Batch size per GPU - seq_len: Sequence length - max_norm: Gradient clipping max norm - steps: Total training steps - dataset: Dataset name - compile: Whether to use torch.compile - - Returns: - Dictionary with training configuration - """ - return { - "local_batch_size": local_batch_size, - "seq_len": seq_len, - "max_norm": max_norm, - "steps": steps, - "dataset": dataset, - "compile": compile, - } - - -def create_parallelism_config( - data_parallel_replicate_degree: int = 1, - data_parallel_shard_degree: int = -1, - tensor_parallel_degree: int = 1, - pipeline_parallel_degree: int = 1, - context_parallel_degree: int = 1, - expert_parallel_degree: int = 1, - disable_loss_parallel: bool = False, -) -> Dict[str, Any]: - """ - Create parallelism configuration. 
- - Args: - data_parallel_replicate_degree: Data parallel replication - data_parallel_shard_degree: Data parallel sharding (FSDP), -1 = auto - tensor_parallel_degree: Tensor parallelism degree - pipeline_parallel_degree: Pipeline parallelism degree - context_parallel_degree: Context parallelism degree - expert_parallel_degree: Expert parallelism degree (for MoE) - disable_loss_parallel: Whether to disable loss parallelism - - Returns: - Dictionary with parallelism configuration - """ - return { - "data_parallel_replicate_degree": data_parallel_replicate_degree, - "data_parallel_shard_degree": data_parallel_shard_degree, - "tensor_parallel_degree": tensor_parallel_degree, - "pipeline_parallel_degree": pipeline_parallel_degree, - "context_parallel_degree": context_parallel_degree, - "expert_parallel_degree": expert_parallel_degree, - "disable_loss_parallel": disable_loss_parallel, - } - - -def create_checkpoint_config( - enable: bool = True, - folder: str = "/tmp/checkpoints", - initial_load_path: Optional[str] = None, - initial_load_in_hf: bool = True, - last_save_in_hf: bool = True, - interval: int = 500, - async_mode: str = "disabled", -) -> Dict[str, Any]: - """ - Create checkpoint configuration. - - Args: - enable: Whether to enable checkpointing - folder: Path to save checkpoints - initial_load_path: Path to load initial checkpoint from - initial_load_in_hf: Load initial checkpoint in HF format - last_save_in_hf: Save last checkpoint in HF format - interval: Steps between checkpoints - async_mode: Async checkpoint mode ('disabled', 'async', etc.) - - Returns: - Dictionary with checkpoint configuration - """ - return { - "enable": enable, - "folder": folder, - "initial_load_path": initial_load_path, - "initial_load_in_hf": initial_load_in_hf, - "last_save_in_hf": last_save_in_hf, - "interval": interval, - "async_mode": async_mode, - } - - -def create_activation_checkpoint_config( - mode: str = "selective", - selective_ac_option: str = "op", -) -> Dict[str, Any]: - """ - Create activation checkpointing configuration. - - Args: - mode: Activation checkpoint mode ('selective', 'full', 'none') - selective_ac_option: Selective AC option ('op', 'layer', etc.) - - Returns: - Dictionary with activation checkpoint configuration - """ - return { - "mode": mode, - "selective_ac_option": selective_ac_option, - } - - -def create_process_config( - procs: int = 8, - with_gpus: bool = True, - hosts: Optional[int] = None, -) -> Dict[str, Any]: - """ - Create process configuration. - - Args: - procs: Number of processes per host - with_gpus: Whether to use GPUs - hosts: Number of hosts (None = single node) - - Returns: - Dictionary with process configuration - """ - config = { - "procs": procs, - "with_gpus": with_gpus, - } - if hosts is not None: - config["hosts"] = hosts - return config - - -# ============================================================================ -# Configuration Assembly -# ============================================================================ - - -def build_config( - model_config: Dict[str, Any], - optimizer_config: Dict[str, Any], - lr_scheduler_config: Dict[str, Any], - training_config: Dict[str, Any], - parallelism_config: Dict[str, Any], - checkpoint_config: Dict[str, Any], - activation_checkpoint_config: Dict[str, Any], - process_config: Dict[str, Any], -) -> DictConfig: - """ - Build complete configuration from component configs. 
- - Args: - model_config: Model configuration - optimizer_config: Optimizer configuration - lr_scheduler_config: LR scheduler configuration - training_config: Training configuration - parallelism_config: Parallelism configuration - checkpoint_config: Checkpoint configuration - activation_checkpoint_config: Activation checkpoint configuration - process_config: Process configuration - - Returns: - Complete OmegaConf DictConfig - """ - config = { - "comm": {"trace_buf_size": 0}, - "model": model_config, - "optimizer": optimizer_config, - "lr_scheduler": lr_scheduler_config, - "training": training_config, - "parallelism": parallelism_config, - "checkpoint": checkpoint_config, - "activation_checkpoint": activation_checkpoint_config, - "processes": process_config, - } - - return OmegaConf.create(config) - - -# ============================================================================ -# Training Functions -# ============================================================================ - - -async def create_recipe(config: DictConfig): - """ - Create and return a ForgeSFTRecipe actor. - - Args: - config: Complete configuration - - Returns: - ForgeSFTRecipe actor instance - """ - process_cfg = config.pop("processes") - recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(config) - logger.info("Recipe created successfully") - return recipe - - -async def setup_recipe(recipe): - """ - Setup the recipe (load model, initialize data loaders, etc.). - - Args: - recipe: ForgeSFTRecipe actor instance - """ - logger.info("Setting up recipe...") - await recipe.setup.call() - logger.info("Recipe setup complete") - - -async def train_recipe(recipe): - """ - Run training on the recipe. - - Args: - recipe: ForgeSFTRecipe actor instance - """ - logger.info("Starting training...") - await recipe.train.call() - logger.info("Training complete") - - -async def cleanup_recipe(recipe): - """ - Cleanup recipe resources. - - Args: - recipe: ForgeSFTRecipe actor instance - """ - logger.info("Cleaning up...") - await recipe.cleanup.call() - await recipe.mesh.stop() - logger.info("Cleanup complete") - - -# ============================================================================ -# High-Level Training API -# ============================================================================ - - -async def run_training(config: DictConfig): - """ - Run complete training pipeline with the given configuration. - - Args: - config: Complete configuration - - Raises: - Exception: If training fails - """ - # Create recipe - recipe = await create_recipe(config) - - # Setup - await setup_recipe(recipe) - - # Train - await train_recipe(recipe) - - # Cleanup - await cleanup_recipe(recipe) - - -def train(config: DictConfig): - """ - Synchronous wrapper for run_training. - - Args: - config: Complete configuration - """ - asyncio.run(run_training(config)) - - -# ============================================================================ -# Display Utilities -# ============================================================================ - - -def print_config(config: DictConfig, title: str = "Configuration"): - """ - Pretty print configuration. - - Args: - config: Configuration to print - title: Title for the output - """ - print(f"\n{'='*60}") - print(f"{title:^60}") - print(f"{'='*60}") - print(OmegaConf.to_yaml(config)) - print(f"{'='*60}\n") - - -def summarize_config(config: DictConfig): - """ - Print a summary of the configuration. 
- - Args: - config: Configuration to summarize - """ - print("\n" + "=" * 60) - print("Configuration Summary".center(60)) - print("=" * 60) - - print(f"\n๐Ÿ“ฆ Model:") - print(f" โ€ข Name: {config.model.name}") - print(f" โ€ข Flavor: {config.model.flavor}") - print(f" โ€ข Path: {config.model.hf_assets_path}") - - print(f"\nโš™๏ธ Training:") - print(f" โ€ข Steps: {config.training.steps}") - print(f" โ€ข Batch Size: {config.training.local_batch_size}") - print(f" โ€ข Sequence Length: {config.training.seq_len}") - print(f" โ€ข Dataset: {config.training.dataset}") - - print(f"\n๐Ÿ”ง Optimizer:") - print(f" โ€ข Name: {config.optimizer.name}") - print(f" โ€ข Learning Rate: {config.optimizer.lr}") - print(f" โ€ข Warmup Steps: {config.lr_scheduler.warmup_steps}") - - print(f"\n๐Ÿ”€ Parallelism:") - print( - f" โ€ข Data Parallel (Replicate): {config.parallelism.data_parallel_replicate_degree}" - ) - print( - f" โ€ข Data Parallel (Shard/FSDP): {config.parallelism.data_parallel_shard_degree}" - ) - print(f" โ€ข Tensor Parallel: {config.parallelism.tensor_parallel_degree}") - print(f" โ€ข Pipeline Parallel: {config.parallelism.pipeline_parallel_degree}") - - print(f"\n๐Ÿ’พ Checkpointing:") - print(f" โ€ข Enabled: {config.checkpoint.enable}") - print(f" โ€ข Folder: {config.checkpoint.folder}") - print(f" โ€ข Interval: {config.checkpoint.interval} steps") - - print(f"\n๐Ÿ–ฅ๏ธ Resources:") - if "hosts" in config.processes: - print(f" โ€ข Hosts: {config.processes.hosts}") - print(f" โ€ข Processes per host: {config.processes.procs}") - print(f" โ€ข GPUs: {config.processes.with_gpus}") - - print("\n" + "=" * 60 + "\n") diff --git a/apps/sft_v2/sft_training_notebook.ipynb b/apps/sft_v2/sft_training_notebook.ipynb deleted file mode 100644 index 204ec15a9..000000000 --- a/apps/sft_v2/sft_training_notebook.ipynb +++ /dev/null @@ -1,568 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐Ÿš€ SFT Training Notebook\n", - "\n", - "This notebook provides an interactive interface for training Language Models using Supervised Fine-Tuning (SFT).\n", - "\n", - "## Features\n", - "- โœ… Interactive configuration in separate cells\n", - "- โœ… Support for single-node and multi-node training\n", - "- โœ… Easy hyperparameter tuning\n", - "- โœ… Flexible parallelism strategies\n", - "- โœ… Checkpoint management\n", - "\n", - "## Quick Start\n", - "1. Configure each section (model, training, etc.)\n", - "2. Review the complete configuration\n", - "3. Run training!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“š Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(0, '/home/hosseinkh/forge')\n", - "\n", - "from apps.sft_v2 import notebook_utils as nb\n", - "import torch\n", - "\n", - "print(f\"โœ… Imports successful!\")\n", - "print(f\"๐Ÿ“Š PyTorch version: {torch.__version__}\")\n", - "print(f\"๐ŸŽฎ CUDA available: {torch.cuda.is_available()}\")\n", - "if torch.cuda.is_available():\n", - " print(f\"๐Ÿ”ข Number of GPUs: {torch.cuda.device_count()}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“ฆ Model Configuration\n", - "\n", - "Configure the model you want to train." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Model Configuration\n", - "model_config = nb.create_model_config(\n", - " name=\"llama3\",\n", - " flavor=\"8B\",\n", - " hf_assets_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct\"\n", - ")\n", - "\n", - "print(\"๐Ÿ“ฆ Model Configuration:\")\n", - "for key, value in model_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## โš™๏ธ Training Configuration\n", - "\n", - "Set training hyperparameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Training Configuration\n", - "training_config = nb.create_training_config(\n", - " local_batch_size=1, # Batch size per GPU\n", - " seq_len=2048, # Sequence length\n", - " max_norm=1.0, # Gradient clipping\n", - " steps=1000, # Total training steps\n", - " dataset=\"c4\", # Dataset name\n", - " compile=False # Use torch.compile?\n", - ")\n", - "\n", - "print(\"โš™๏ธ Training Configuration:\")\n", - "for key, value in training_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ”ง Optimizer Configuration\n", - "\n", - "Configure the optimizer and learning rate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Optimizer Configuration\n", - "optimizer_config = nb.create_optimizer_config(\n", - " name=\"AdamW\",\n", - " lr=1e-5, # Learning rate\n", - " eps=1e-8, # Epsilon\n", - " weight_decay=0.0, # Weight decay\n", - " betas=(0.9, 0.999) # Adam betas\n", - ")\n", - "\n", - "# LR Scheduler Configuration\n", - "lr_scheduler_config = nb.create_lr_scheduler_config(\n", - " warmup_steps=200, # Warmup steps\n", - " decay_steps=None, # Decay steps (None = no decay)\n", - " min_lr=0.0 # Minimum LR\n", - ")\n", - "\n", - "print(\"๐Ÿ”ง Optimizer Configuration:\")\n", - "for key, value in optimizer_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "\n", - "print(\"\\n๐Ÿ“ˆ LR Scheduler Configuration:\")\n", - "for key, value in lr_scheduler_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ”€ Parallelism Configuration\n", - "\n", - "Configure distributed training strategies.\n", - "\n", - "### Parallelism Options:\n", - "- **Data Parallel (Replicate)**: Basic data parallelism\n", - "- **Data Parallel (Shard/FSDP)**: Fully Sharded Data Parallel (-1 = use all GPUs)\n", - "- **Tensor Parallel**: Split model across multiple GPUs\n", - "- **Pipeline Parallel**: Split model stages across GPUs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Parallelism Configuration\n", - "parallelism_config = nb.create_parallelism_config(\n", - " data_parallel_replicate_degree=1, # DP replicate\n", - " data_parallel_shard_degree=-1, # FSDP (-1 = auto, uses all GPUs)\n", - " tensor_parallel_degree=1, # TP\n", - " pipeline_parallel_degree=1, # PP\n", - " context_parallel_degree=1, # CP\n", - " expert_parallel_degree=1, # EP (for MoE)\n", - " disable_loss_parallel=False\n", - ")\n", - "\n", - "print(\"๐Ÿ”€ Parallelism Configuration:\")\n", - "for key, value in parallelism_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "## ๐Ÿ’พ Checkpoint Configuration\n", - "\n", - "Configure model checkpointing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Checkpoint Configuration\n", - "checkpoint_config = nb.create_checkpoint_config(\n", - " enable=True,\n", - " folder=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", - " initial_load_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/\",\n", - " initial_load_in_hf=True,\n", - " last_save_in_hf=True,\n", - " interval=500, # Save every N steps\n", - " async_mode=\"disabled\"\n", - ")\n", - "\n", - "# Activation Checkpoint Configuration (for memory efficiency)\n", - "activation_checkpoint_config = nb.create_activation_checkpoint_config(\n", - " mode=\"selective\", # 'selective', 'full', or 'none'\n", - " selective_ac_option=\"op\" # 'op' or 'layer'\n", - ")\n", - "\n", - "print(\"๐Ÿ’พ Checkpoint Configuration:\")\n", - "for key, value in checkpoint_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "\n", - "print(\"\\n๐Ÿ”„ Activation Checkpoint Configuration:\")\n", - "for key, value in activation_checkpoint_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ–ฅ๏ธ Resource Configuration\n", - "\n", - "Configure compute resources.\n", - "\n", - "### Options:\n", - "- **Single Node**: Set only `procs` (number of GPUs)\n", - "- **Multi Node**: Set both `hosts` (number of nodes) and `procs` (GPUs per node)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Choose ONE of the following:\n", - "\n", - "# Option 1: Single Node (8 GPUs)\n", - "process_config = nb.create_process_config(\n", - " procs=8,\n", - " with_gpus=True,\n", - " hosts=None # None = single node\n", - ")\n", - "\n", - "# Option 2: Multi-Node (4 nodes ร— 8 GPUs = 32 total)\n", - "# Uncomment to use:\n", - "# process_config = nb.create_process_config(\n", - "# procs=8,\n", - "# with_gpus=True,\n", - "# hosts=4\n", - "# )\n", - "\n", - "print(\"๐Ÿ–ฅ๏ธ Resource Configuration:\")\n", - "for key, value in process_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "\n", - "if \"hosts\" in process_config and process_config[\"hosts\"]:\n", - " total_gpus = process_config[\"hosts\"] * process_config[\"procs\"]\n", - " print(f\"\\n๐Ÿ“Š Total GPUs: {total_gpus}\")\n", - "else:\n", - " print(f\"\\n๐Ÿ“Š Total GPUs: {process_config['procs']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## โ˜๏ธ Provisioner Configuration (Optional)\n", - "\n", - "**Only needed for multi-node training on SLURM clusters.**\n", - "\n", - "โš ๏ธ Skip this cell if you're running single-node training!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Provisioner Configuration (OPTIONAL - for multi-node only)\n", - "# Set to None for single-node training\n", - "\n", - "provisioner_config = None # Default: no provisioner\n", - "\n", - "# Uncomment and configure for SLURM multi-node training:\n", - "# provisioner_config = nb.create_provisioner_config(\n", - "# launcher=\"slurm\",\n", - "# job_name=\"sft_training\",\n", - "# partition=\"your_gpu_partition\", # REQUIRED for SLURM\n", - "# time=\"24:00:00\", # REQUIRED for SLURM\n", - "# account=\"your_account\" # May be required\n", - "# )\n", - "\n", - "if provisioner_config:\n", - " print(\"โ˜๏ธ Provisioner Configuration:\")\n", - " for key, value in provisioner_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "else:\n", - " print(\"โ˜๏ธ Provisioner: Disabled (single-node mode)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ”จ Build Complete Configuration\n", - "\n", - "Combine all configurations into a single config object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Build complete configuration\n", - "config = nb.build_config(\n", - " model_config=model_config,\n", - " optimizer_config=optimizer_config,\n", - " lr_scheduler_config=lr_scheduler_config,\n", - " training_config=training_config,\n", - " parallelism_config=parallelism_config,\n", - " checkpoint_config=checkpoint_config,\n", - " activation_checkpoint_config=activation_checkpoint_config,\n", - " process_config=process_config,\n", - " provisioner_config=provisioner_config\n", - ")\n", - "\n", - "print(\"โœ… Configuration built successfully!\\n\")\n", - "\n", - "# Display summary\n", - "nb.summarize_config(config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“„ View Full Configuration (YAML)\n", - "\n", - "See the complete configuration in YAML format." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Print full configuration\n", - "nb.print_config(config, title=\"Complete Training Configuration\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ’พ Save Configuration (Optional)\n", - "\n", - "Save the configuration to a YAML file for later use." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from omegaconf import OmegaConf\n", - "\n", - "# Save configuration\n", - "config_path = \"/home/hosseinkh/forge/apps/sft_v2/my_training_config.yaml\"\n", - "with open(config_path, 'w') as f:\n", - " OmegaConf.save(config, f)\n", - "\n", - "print(f\"โœ… Configuration saved to: {config_path}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿš€ Run Training!\n", - "\n", - "Start the training process with the configured settings.\n", - "\n", - "โš ๏ธ **Note**: This will start actual training and may take a long time!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run training\n", - "print(\"๐Ÿš€ Starting training...\\n\")\n", - "\n", - "try:\n", - " nb.train(config)\n", - " print(\"\\nโœ… Training completed successfully!\")\n", - "except Exception as e:\n", - " print(f\"\\nโŒ Training failed: {e}\")\n", - " import traceback\n", - " traceback.print_exc()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ” Advanced: Step-by-Step Execution\n", - "\n", - "For more control, you can run each training stage separately.\n", - "\n", - "โš ๏ธ **Only run this section if you want manual control. Otherwise, use the cell above.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 1: Initialize provisioner (if configured)\n", - "import asyncio\n", - "\n", - "provisioner_initialized = await nb.initialize_provisioner(config)\n", - "print(f\"Provisioner initialized: {provisioner_initialized}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 2: Create recipe\n", - "recipe = await nb.create_recipe(config)\n", - "print(\"Recipe created\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 3: Setup recipe (load model, data, etc.)\n", - "await nb.setup_recipe(recipe)\n", - "print(\"Recipe setup complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 4: Run training\n", - "await nb.train_recipe(recipe)\n", - "print(\"Training complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 5: Cleanup\n", - "await nb.cleanup_recipe(recipe)\n", - "print(\"Cleanup complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 6: Shutdown provisioner (if initialized)\n", - "if provisioner_initialized:\n", - " await nb.shutdown_provisioner(config)\n", - " print(\"Provisioner shutdown complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“Š Tips & Tricks\n", - "\n", - "### Memory Optimization\n", - "- Use **FSDP** (set `data_parallel_shard_degree=-1`) for large models\n", - "- Enable **activation checkpointing** (set `mode=\"selective\"` or `\"full\"`)\n", - "- Reduce **batch size** or **sequence length**\n", - "\n", - "### Speed Optimization\n", - "- Use **tensor parallelism** for large models (set `tensor_parallel_degree > 1`)\n", - "- Enable **compilation** (set `compile=True`)\n", - "- Increase **batch size** if memory allows\n", - "\n", - "### Multi-Node Training\n", - "- Set `hosts` in process config\n", - "- Configure provisioner with SLURM details\n", - "- Make sure model path is accessible on all nodes\n", - "\n", - "### Debugging\n", - "- Start with fewer steps (e.g., `steps=10`)\n", - "- Use single GPU first (`procs=1`)\n", - "- Check logs for errors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐ŸŽฏ Common Configurations\n", - "\n", - "### Quick Test Run\n", - "```python\n", - "training_config = nb.create_training_config(\n", - " steps=10,\n", - " local_batch_size=1\n", - ")\n", - "process_config = nb.create_process_config(procs=1)\n", - "```\n", - "\n", - "### Single Node, 8 GPUs, FSDP\n", - "```python\n", - 
"parallelism_config = nb.create_parallelism_config(\n", - " data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP\n", - ")\n", - "process_config = nb.create_process_config(procs=8)\n", - "```\n", - "\n", - "### Multi-Node, 4ร—8 GPUs, TP=2\n", - "```python\n", - "parallelism_config = nb.create_parallelism_config(\n", - " data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP\n", - " tensor_parallel_degree=2\n", - ")\n", - "process_config = nb.create_process_config(procs=8, hosts=4)\n", - "provisioner_config = nb.create_provisioner_config(\n", - " launcher=\"slurm\",\n", - " partition=\"gpu_partition\"\n", - ")\n", - "```" - ] - } - ], - "metadata": { - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/apps/sft_v2/spawn_actor.py b/apps/sft_v2/spawn_actor.py new file mode 100644 index 000000000..eb9695c76 --- /dev/null +++ b/apps/sft_v2/spawn_actor.py @@ -0,0 +1,139 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +SpawnActor - Orchestrates the spawning and lifecycle management of actors. + +This module provides a high-level interface for creating, setting up, running, +and cleaning up different types of actors (e.g., Trainer, Evaluator, etc.) +""" + +import logging +from typing import Any, Type + +from forge.apps.sft_v2.actor import BaseForgeActor +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class SpawnActor: + """ + Orchestrator for spawning and managing actor lifecycles. + + This class handles the creation, setup, execution, and cleanup of actors + in a standardized way. + """ + + def __init__(self, actor_class: Type[BaseForgeActor], config: DictConfig): + """ + Initialize the spawn actor orchestrator. + + Args: + actor_class: The actor class to instantiate (must inherit from BaseForgeActor) + config: Configuration dictionary for the actor + """ + self.actor_class = actor_class + self.config = config + self.actor = None + + if not issubclass(actor_class, BaseForgeActor): + raise TypeError( + f"actor_class must be a subclass of BaseForgeActor, got {actor_class}" + ) + + async def spawn(self) -> Any: + """ + Spawn the actor instance with the given configuration. + + Returns: + The spawned actor instance + """ + logger.info(f"Spawning {self.actor_class.__name__}...") + + process_cfg = self.config.pop("processes", {}) + + self.actor = await self.actor_class.options(**process_cfg).as_actor(self.config) + + logger.info(f"{self.actor_class.__name__} spawned successfully.") + return self.actor + + async def setup(self): + """ + Setup the spawned actor (load data, checkpoints, etc.). + """ + if self.actor is None: + raise RuntimeError( + "Actor must be spawned before setup. Call spawn() first." + ) + + logger.info(f"Setting up {self.actor_class.__name__}...") + await self.actor.setup.call() + logger.info(f"{self.actor_class.__name__} setup complete.") + + async def run(self): + """ + Run the main execution logic of the actor. + """ + if self.actor is None: + raise RuntimeError( + "Actor must be spawned before running. Call spawn() first." + ) + + logger.info(f"Running {self.actor_class.__name__}...") + await self.actor.run.call() + logger.info(f"{self.actor_class.__name__} execution complete.") + + async def cleanup(self): + """ + Cleanup the actor resources and stop the mesh. 
+ """ + if self.actor is None: + raise RuntimeError( + "Actor must be spawned before cleanup. Call spawn() first." + ) + + logger.info(f"Cleaning up {self.actor_class.__name__}...") + await self.actor.cleanup.call() + + if hasattr(self.actor, "mesh"): + await self.actor.mesh.stop() + + logger.info(f"{self.actor_class.__name__} cleanup complete.") + + async def run_full_lifecycle(self): + """ + Execute the complete actor lifecycle: spawn -> setup -> run -> cleanup. + + This is a convenience method that runs all phases in sequence. + """ + logger.info(f"Starting full lifecycle for {self.actor_class.__name__}...") + + try: + await self.spawn() + await self.setup() + await self.run() + finally: + if self.actor is not None: + await self.cleanup() + + logger.info(f"Full lifecycle complete for {self.actor_class.__name__}.") + + +async def run_actor( + actor_class: Type[BaseForgeActor], + config: DictConfig, +) -> None: + """ + Convenience function to run an actor with full lifecycle management. + + Args: + actor_class: The actor class to instantiate + config: Configuration dictionary for the actor + """ + spawner = SpawnActor(actor_class, config) + await spawner.run_full_lifecycle() diff --git a/apps/sft_v2/trainer_actor.py b/apps/sft_v2/trainer_actor.py new file mode 100644 index 000000000..10c5e9b38 --- /dev/null +++ b/apps/sft_v2/trainer_actor.py @@ -0,0 +1,189 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Trainer actor implementation for SFT training. + +This is a concrete implementation of BaseForgeActor for supervised fine-tuning. +""" + +import logging + +import torch +import torchtitan.experiments.forge.train_spec as forge_train_spec +from forge.apps.sft_v2.actor import BaseForgeActor +from forge.apps.sft_v2.utils import ( + create_context_parallel_context, + log_training_step, + move_batch_to_device, + setup_sft_dataloader, + setup_tokenizer, +) +from monarch.actor import endpoint +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class TrainerActor(BaseForgeActor): + """ + Concrete trainer actor for supervised fine-tuning. + + Handles training loop, forward/backward passes, and checkpoint management. + """ + + train_spec: forge_train_spec.ForgeTrainSpec + train_dataloader: any + num_training_steps: int + + def __init__(self, config: DictConfig): + """ + Initialize the trainer actor. + + Args: + config: Configuration dictionary containing training settings + """ + super().__init__(config) + self.num_training_steps = self.job_config.training.steps + + @endpoint + async def setup(self): + """ + Setup the trainer (load data, checkpoint, etc.). 
+ """ + logger.info("Setting up trainer actor...") + + self.tokenizer = setup_tokenizer( + hf_assets_path=self.job_config.model.hf_assets_path + ) + + self.train_dataloader = setup_sft_dataloader( + tokenizer=self.tokenizer, + dataset_path="yahma/alpaca-cleaned", + dataset_split="train", + target_tokens_per_pack=self.job_config.training.seq_len, + batch_size=self.job_config.training.local_batch_size, + device=self.device, + ) + + if self.checkpointer: + logger.info("Loading checkpoint...") + self.checkpointer.load(step=self.current_step) + + logger.info("Trainer setup complete.") + + def forward_backward( + self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor + ) -> torch.Tensor: + """ + Perform forward and backward pass. + + Args: + input_dict: Dictionary containing input tokens + labels: Ground truth labels + + Returns: + Computed loss value + """ + model_parts = self.model_parts + parallel_dims = self.parallel_dims + inputs = input_dict["tokens"] + + optional_context_parallel_ctx = create_context_parallel_context( + parallel_dims=parallel_dims, + inputs=inputs, + labels=labels, + model_parts=model_parts, + rotate_method=self.job_config.parallelism.context_parallel_rotate_method, + ) + + if parallel_dims.pp_enabled: + with self.train_context(optional_context_parallel_ctx): + targets, losses = ( + (labels, []) if self.pp_has_last_stage else (None, None) + ) + if self.pp_has_first_stage: + self.pp_schedule.step( + inputs, target=targets, losses=losses, input_batch=inputs + ) + else: + self.pp_schedule.step( + target=targets, losses=losses, input_batch=inputs + ) + + loss = ( + torch.mean(torch.stack(losses)).to(self.device) + if self.pp_has_last_stage + else torch.tensor([-1.0], device=self.device) + ) + else: + with self.train_context(optional_context_parallel_ctx): + assert len(model_parts) == 1 + with self.maybe_enable_amp: + pred = model_parts[0](inputs) + loss = self.loss_fn(pred, labels) + del pred + loss.backward() + + return loss + + def train_step(self, batch: dict[str, torch.Tensor]) -> None: + """ + Execute a single training step. + + Args: + batch: Dictionary containing batch data (tokens, labels, etc.) + """ + labels = batch.pop("labels") + loss = self.forward_backward(batch, labels) + + log_training_step(self.current_step, self.num_training_steps, loss, logger) + + self.optimizers.step() + self.lr_schedulers.step() + + @endpoint + async def run(self) -> None: + """ + Main training loop. + """ + logger.info("Starting training loop...") + + dataloader = iter(self.train_dataloader) + self.optimizers.zero_grad() + + while self.current_step < self.num_training_steps: + batch = next(dataloader) + batch = move_batch_to_device(batch, self.device) + + self.train_step(batch) + self.current_step += 1 + + if self.checkpointer: + self.checkpointer.save( + curr_step=self.current_step, + last_step=self.current_step == self.num_training_steps, + ) + + logger.info("Training complete!") + + @endpoint + async def cleanup(self) -> None: + """ + Cleanup resources (close checkpointer, logger, etc.). + """ + logger.info("Cleaning up trainer actor...") + + if self.checkpointer: + self.checkpointer.close() + if self.metric_logger: + self.metric_logger.close() + + logger.info("Cleanup complete.") + + def __repr__(self) -> str: + return "TrainerActor" diff --git a/apps/sft_v2/utils.py b/apps/sft_v2/utils.py new file mode 100644 index 000000000..6d0219805 --- /dev/null +++ b/apps/sft_v2/utils.py @@ -0,0 +1,187 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utility functions for SFT training actors. + +These utilities handle data loading, model setup, and common operations. +""" + +import logging +import os +from functools import partial +from typing import Any, Optional + +import torch +from forge.data.collate import collate_packed +from forge.data.datasets.packed import PackedDataset, TextPacker +from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset +from forge.data.tokenizer import HuggingFaceModelTokenizer +from torchdata.stateful_dataloader import StatefulDataLoader +from torchtitan.distributed import ParallelDims, utils as dist_utils + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def setup_tokenizer( + hf_assets_path: str, + tokenizer_filename: str = "tokenizer.json", + tokenizer_config_filename: str = "tokenizer_config.json", + generation_config_filename: str = "generation_config.json", +) -> HuggingFaceModelTokenizer: + """ + Setup HuggingFace tokenizer from model assets. + + Args: + hf_assets_path: Path to the directory containing tokenizer files + tokenizer_filename: Name of the tokenizer JSON file + tokenizer_config_filename: Name of the tokenizer config JSON file + generation_config_filename: Name of the generation config JSON file + + Returns: + Initialized HuggingFaceModelTokenizer + """ + tokenizer_json_path = os.path.join(hf_assets_path, tokenizer_filename) + tokenizer_config_path = os.path.join(hf_assets_path, tokenizer_config_filename) + generation_config_path = os.path.join(hf_assets_path, generation_config_filename) + + logger.info(f"Loading tokenizer from: {tokenizer_json_path}") + + tokenizer = HuggingFaceModelTokenizer( + tokenizer_json_path=tokenizer_json_path, + tokenizer_config_json_path=tokenizer_config_path, + generation_config_path=generation_config_path, + ) + + return tokenizer + + +def setup_sft_dataloader( + tokenizer: HuggingFaceModelTokenizer, + dataset_path: str, + dataset_split: str, + target_tokens_per_pack: int, + batch_size: int, + device: torch.device, + padding_idx: int = 0, + message_transform: Optional[Any] = None, +) -> StatefulDataLoader: + """ + Setup dataloader for SFT training. 
+ + Args: + tokenizer: Tokenizer to use for processing text + dataset_path: Path or name of the dataset (e.g., "yahma/alpaca-cleaned") + dataset_split: Dataset split to use (e.g., "train", "validation") + target_tokens_per_pack: Target sequence length for packing + batch_size: Batch size for training + device: Device to move tensors to + padding_idx: Padding token index + message_transform: Transform to convert dataset format to messages + + Returns: + Configured StatefulDataLoader + """ + if message_transform is None: + message_transform = AlpacaToMessages() + + logger.info(f"Loading SFT dataset from: {dataset_path}, split: {dataset_split}") + + dataset = sft_iterable_dataset( + model_transform=tokenizer, + message_transform=message_transform, + path=dataset_path, + split=dataset_split, + ) + + packer = TextPacker(padding_idx=padding_idx) + dataset = PackedDataset( + dataset=dataset, + packer=packer, + target_tokens_per_pack=target_tokens_per_pack, + ) + + dataloader = StatefulDataLoader( + dataset=dataset, + batch_size=batch_size, + collate_fn=partial( + collate_packed, mask_fn=packer.create_block_mask, device=device + ), + ) + + logger.info( + f"Created dataloader with batch_size={batch_size}, target_tokens={target_tokens_per_pack}" + ) + + return dataloader + + +def create_context_parallel_context( + parallel_dims: ParallelDims, + inputs: torch.Tensor, + labels: torch.Tensor, + model_parts: list, + rotate_method: str, +): + """ + Create context parallel context for distributed training. + + Args: + parallel_dims: Parallel dimensions configuration + inputs: Input tensor + labels: Label tensor + model_parts: List of model parts + rotate_method: Context parallel rotation method + + Returns: + Context parallel context or None if CP is not enabled + """ + if not parallel_dims.cp_enabled: + return None + + return dist_utils.create_context_parallel_ctx( + cp_mesh=parallel_dims.world_mesh["cp"], + cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], + cp_seq_dims=[1, 1] + [0 for _ in model_parts], + cp_no_restore_buffers={inputs, labels}, + cp_rotate_method=rotate_method, + ) + + +def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any]: + """ + Move batch tensors to the specified device. + + Args: + batch: Dictionary containing batch data + device: Target device + + Returns: + Batch with tensors moved to device + """ + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + batch[key] = value.to(device) + return batch + + +def log_training_step( + step: int, + total_steps: int, + loss: torch.Tensor, + logger: logging.Logger, +): + """ + Log training step information. 
+ + Args: + step: Current training step + total_steps: Total number of training steps + loss: Current loss value + logger: Logger instance + """ + logger.info(f"Step {step}/{total_steps} | Loss: {loss.item():.4f}") From a0f62e72c6cd55aee47cfc50be6266b37a1d8e33 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Tue, 14 Oct 2025 11:05:04 -0700 Subject: [PATCH 3/7] Adding eval loop to the sft --- apps/sft/llama3_8b.yaml | 2 + apps/sft/main.py | 178 +++- apps/sft_v2/NOTEBOOK_GUIDE.md | 847 ------------------ apps/sft_v2/actor.py | 133 --- apps/sft_v2/interactive_config_notebook.ipynb | 629 ------------- apps/sft_v2/spawn_actor.py | 139 --- apps/sft_v2/trainer_actor.py | 189 ---- apps/sft_v2/utils.py | 187 ---- 8 files changed, 139 insertions(+), 2165 deletions(-) delete mode 100644 apps/sft_v2/NOTEBOOK_GUIDE.md delete mode 100644 apps/sft_v2/actor.py delete mode 100644 apps/sft_v2/interactive_config_notebook.ipynb delete mode 100644 apps/sft_v2/spawn_actor.py delete mode 100644 apps/sft_v2/trainer_actor.py delete mode 100644 apps/sft_v2/utils.py diff --git a/apps/sft/llama3_8b.yaml b/apps/sft/llama3_8b.yaml index 43a690c1e..2fd563a6c 100644 --- a/apps/sft/llama3_8b.yaml +++ b/apps/sft/llama3_8b.yaml @@ -33,6 +33,8 @@ training: steps: 1000 compile: false dataset: "c4" + #eval_interval: 500 # Setting eval_interval to run evaluation + #eval_steps: 100 # Number of validation batches during each evaluation run parallelism: data_parallel_replicate_degree: 1 diff --git a/apps/sft/main.py b/apps/sft/main.py index 27a8036d4..97ed4125e 100644 --- a/apps/sft/main.py +++ b/apps/sft/main.py @@ -7,7 +7,6 @@ """To run: python -m apps.sft.main --config apps/sft/llama3_8b.yaml - """ import asyncio @@ -40,8 +39,6 @@ from torchtitan.experiments.forge.engine import ForgeEngine from torchtitan.experiments.forge.job_config import ForgeJobConfig -# from tqdm import tqdm - # stubs for now Checkpointer = Any Dataloader = Any @@ -64,7 +61,7 @@ class ForgeSFTRecipe(ForgeActor, ForgeEngine): checkpointer: Checkpointer tokenizer: Tokenizer train_dataloader: Dataloader - # val_dataloader: Dataloader + val_dataloader: Dataloader metric_logger: MetricLogger profiler: Profiler device: torch.device @@ -81,6 +78,11 @@ def __init__(self, config: DictConfig): self.gradient_accumulation_steps = 1 # Example value, adjust as needed self._rank = current_rank().rank self._size = math.prod(current_size().values()) + + # Evaluation settings + self.eval_interval = job_config.training.get("eval_interval", float("inf")) + self.eval_steps = job_config.training.get("eval_steps", 0) + self._init_dist() super().__init__(job_config) @@ -111,25 +113,23 @@ def _init_dist(self): @endpoint async def setup(self): - self.train_dataloader = self.setup_data() - # self.train_dataloader = self.setup_data( - # self.train_config.train_dataset_config, - # self.train_config.train_dataloader_config, - # self.train_config.packing_config, - # ) - # self.val_dataloader = self.setup_data( - # self.train_config.val_dataset_config, - # self.train_config.val_dataloader_config, - # self.train_config.packing_config, - # ) - - # TODO: confirm that this is working properly - # Should also use load, not dcp_load + # Setup training data (first 90% of train split) + self.train_dataloader = self.setup_data( + dataset_path="yahma/alpaca-cleaned", dataset_split="train[:90%]" + ) + + # Setup validation data (last 10% of train split) + self.val_dataloader = self.setup_data( + dataset_path="yahma/alpaca-cleaned", dataset_split="train[90%:]" + ) + + # Load checkpoint if 
resuming self.checkpointer.load(step=self.current_step) - # self.profiler = self.setup_profiler(self.train_config.profiler_config) - # self.logger = self.setup_logger(self.train_config.logger_config) - def setup_data(self): + def setup_data( + self, dataset_path: str = "yahma/alpaca-cleaned", dataset_split: str = "train" + ): + """Setup data with configurable dataset path and split.""" print(os.path.join(self.job_config.model.hf_assets_path, "tokenizer.json")) tokenizer = HuggingFaceModelTokenizer( tokenizer_json_path=os.path.join( @@ -146,8 +146,8 @@ def setup_data(self): dataset = sft_iterable_dataset( model_transform=tokenizer, message_transform=AlpacaToMessages(), - path="yahma/alpaca-cleaned", - split="train", + path=dataset_path, + split=dataset_split, ) packer = TextPacker(padding_idx=0) dataset = PackedDataset( @@ -163,10 +163,6 @@ def setup_data(self): ), ) - # Ultimately we probably want something like this - # packer = build_packing_strategy(packing_config) - # dataset = build_dataset(dataset_config) - # dataloader = build_dataloader(dataloader_config, dataset, packer) return dataloader def forward_backward( @@ -206,7 +202,6 @@ def forward_backward( ) # accumulate losses across pipeline microbatches - # TODO: PP+FSDP unexpectedly puts the loss back to the CPU loss = ( torch.mean(torch.stack(losses)).to(self.device) if self.pp_has_last_stage @@ -225,27 +220,125 @@ def forward_backward( return loss + def forward_only( + self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor + ) -> torch.Tensor: + """Forward pass only for evaluation (no backward).""" + model_parts = self.model_parts + parallel_dims = self.parallel_dims + + inputs = input_dict["tokens"] + optional_context_parallel_ctx = ( + dist_utils.create_context_parallel_ctx( + cp_mesh=parallel_dims.world_mesh["cp"], + cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], + cp_seq_dims=[1, 1] + [0 for _ in model_parts], + cp_no_restore_buffers={inputs, labels}, + cp_rotate_method=self.job_config.parallelism.context_parallel_rotate_method, + ) + if parallel_dims.cp_enabled + else None + ) + + if parallel_dims.pp_enabled: + # Pipeline Parallel forward only + with self.train_context(optional_context_parallel_ctx): + targets, losses = ( + (labels, []) if self.pp_has_last_stage else (None, None) + ) + if self.pp_has_first_stage: + self.pp_schedule.step( + inputs, target=targets, losses=losses, input_batch=inputs + ) + else: + self.pp_schedule.step( + target=targets, losses=losses, input_batch=inputs + ) + + loss = ( + torch.mean(torch.stack(losses)).to(self.device) + if self.pp_has_last_stage + else torch.tensor([-1.0], device=self.device) + ) + else: + # Non-PP forward only + with self.train_context(optional_context_parallel_ctx): + assert len(model_parts) == 1 + with self.maybe_enable_amp: + pred = model_parts[0](inputs) + loss = self.loss_fn(pred, labels) + del pred + + return loss + def train_step(self, batch) -> None: - # TODO - # with GradientAccumulation( - # self.gradient_accumulation_steps, - # self.model, - # self.data_parallel_size, - # ) as grad_acc: labels = batch.pop("labels") loss = self.forward_backward(batch, labels) logger.info(f"{self.current_step} / {self.num_training_steps}|Loss: {loss}") - # self.pbar.set_description(f"{self.current_step}|Loss: {loss}") - # self.pbar.update(1) self.optimizers.step() self.lr_schedulers.step() + async def evaluate(self) -> dict[str, float]: + """Run evaluation on validation set (internal method, not an endpoint).""" + logger.info("=" * 50) + 
logger.info("STARTING EVALUATION ") + logger.info("=" * 50) + + # Set model to eval mode + for model_part in self.model_parts: + model_part.eval() + + val_dataloader = iter(self.val_dataloader) + total_loss = 0.0 + num_batches = 0 + + with torch.no_grad(): + for step in range(self.eval_steps): + try: + batch = next(val_dataloader) + + # Move tensors to device + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + batch[k] = v.to(self.device) + + labels = batch.pop("labels") + loss = self.forward_only(batch, labels) + + total_loss += loss.item() + num_batches += 1 + + logger.info( + f" Eval batch {num_batches}/{self.eval_steps} | Loss: {loss.item():.4f}" + ) + + except StopIteration: + logger.warning("Reached end of validation dataloader early") + break + + # Set model back to train mode + for model_part in self.model_parts: + model_part.train() + + avg_loss = total_loss / max(num_batches, 1) + + metrics = { + "val_loss": avg_loss, + "val_batches": num_batches, + } + + logger.info("-" * 50) + logger.info(f"EVALUATION COMPLETE") + logger.info(f"Validation Loss: {avg_loss:.4f}") + logger.info(f"Batches Evaluated: {num_batches}") + logger.info("=" * 50) + return metrics + @endpoint async def train(self) -> None: dataloader = iter(self.train_dataloader) self.optimizers.zero_grad() - # TODO: tqdm is broken in Monarch actors # self.pbar = tqdm(initial=self.current_step, total=self.num_training_steps) @@ -254,18 +347,21 @@ async def train(self) -> None: # Move tensors to the appropriate device for k, v in batch.items(): if isinstance(v, torch.Tensor): - batch[k] = v.to("cuda") # TODO: hardcoded for now + batch[k] = v.to(self.device) # TODO: hardcoded for now self.train_step(batch) - # self.profiler.step() self.current_step += 1 + # Run evaluation periodically + if self.current_step % self.eval_interval == 0: + eval_metrics = await self.evaluate() + logger.info(f"Step {self.current_step} | Eval metrics: {eval_metrics}") + + # Save checkpoints self.checkpointer.save( curr_step=self.current_step, last_step=self.current_step == self.num_training_steps, ) - # self.pbar.close() - @endpoint async def cleanup(self) -> None: if self.checkpointer: diff --git a/apps/sft_v2/NOTEBOOK_GUIDE.md b/apps/sft_v2/NOTEBOOK_GUIDE.md deleted file mode 100644 index b3524ed31..000000000 --- a/apps/sft_v2/NOTEBOOK_GUIDE.md +++ /dev/null @@ -1,847 +0,0 @@ -# Complete Guide: Interactive Configuration Notebook - -This guide explains step-by-step how to use the interactive configuration notebook for SFT training. - ---- - -## ๐Ÿ“– Table of Contents - -1. [Overview](#overview) -2. [Architecture Components](#architecture-components) -3. [Notebook Step-by-Step](#notebook-step-by-step) -4. [Utility Functions Explained](#utility-functions-explained) -5. [How to Run](#how-to-run) -6. [Common Scenarios](#common-scenarios) -7. [Troubleshooting](#troubleshooting) - ---- - -## Overview - -The interactive configuration notebook (`interactive_config_notebook.ipynb`) allows you to: -- Configure SFT training **without YAML files** -- Define configuration interactively in separate cells -- Easily modify parameters and experiment -- Use pre-built templates for common scenarios - -### What Problem Does This Solve? 
- -**Before**: You had to edit YAML files, which required: -- External file management -- Reloading files after changes -- Difficult to experiment with different configs - -**After**: You can: -- Define everything in the notebook -- Change values in cells and re-run -- See all configurations clearly -- No external file management needed - ---- - -## Architecture Components - -Before diving into the notebook, let's understand the components: - -### 1. BaseForgeActor (`actor.py`) - -**What it is**: An abstract base class that defines the contract for all actors. - -**What it does**: -- Handles distributed initialization (sets up multi-GPU environment) -- Manages common attributes (model, optimizer, checkpointer, etc.) -- Defines three required methods that subclasses must implement: - - `setup()` - Initialize data, checkpoints, etc. - - `run()` - Main execution logic - - `cleanup()` - Resource cleanup - -**Why it matters**: Provides a consistent interface for different actor types (Trainer, Evaluator, Inferencer, etc.) - -### 2. TrainerActor (`trainer_actor.py`) - -**What it is**: A concrete implementation of BaseForgeActor for training. - -**What it does**: -- Implements the training loop -- Handles forward/backward passes -- Manages checkpointing -- Supports various parallelism strategies (FSDP, Pipeline Parallel, Tensor Parallel) - -**Key Methods**: -- `setup()` - Loads tokenizer, dataset, and checkpoints -- `run()` - Executes the training loop -- `forward_backward()` - Performs forward and backward passes -- `train_step()` - Single training step -- `cleanup()` - Closes resources - -### 3. SpawnActor (`spawn_actor.py`) - -**What it is**: An orchestrator that manages actor lifecycle. - -**What it does**: -- Creates actor instances -- Manages the lifecycle: spawn โ†’ setup โ†’ run โ†’ cleanup -- Provides error handling and cleanup guarantees - -**Key Methods**: -- `spawn()` - Creates the actor instance -- `setup()` - Calls actor's setup -- `run()` - Calls actor's run -- `cleanup()` - Calls actor's cleanup and stops the mesh -- `run_full_lifecycle()` - Executes all phases automatically - -**Why it matters**: Simplifies actor management and ensures proper resource cleanup. - -### 4. Utility Functions (`utils.py`) - -Helper functions for common operations. See [Utility Functions Explained](#utility-functions-explained) section below. - ---- - -## Notebook Step-by-Step - -### Step 1: Import Dependencies - -```python -import asyncio -import logging -from omegaconf import OmegaConf, DictConfig - -from forge.apps.sft_v2.trainer_actor import TrainerActor -from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor -``` - -**What this does**: -- `asyncio` - For async/await operations (actors run asynchronously) -- `logging` - For logging training progress -- `OmegaConf` - For managing configurations (converts dicts to config objects) -- `TrainerActor` - The training actor we'll use -- `SpawnActor`, `run_actor` - For managing actor lifecycle - -**Why we need it**: These are the core dependencies for running the actor-based training. 
- ---- - -### Step 2: Configure Model Settings - -```python -model_config = { - "name": "llama3", - "flavor": "8B", - "hf_assets_path": "/tmp/Meta-Llama-3.1-8B-Instruct" -} -``` - -**What this does**: -- `name` - Model architecture type (e.g., "llama3", "llama2") -- `flavor` - Model size (e.g., "8B", "70B", "405B") -- `hf_assets_path` - Path to the model files (tokenizer, weights, config) - -**How to modify**: -- Change `flavor` to use different model sizes -- Update `hf_assets_path` to point to your model location -- Make sure the path contains `tokenizer.json`, `tokenizer_config.json`, and model weights - -**Example variations**: -```python -# For a 70B model -model_config = { - "name": "llama3", - "flavor": "70B", - "hf_assets_path": "/path/to/Meta-Llama-3.1-70B" -} -``` - ---- - -### Step 3: Configure Process Settings - -```python -processes_config = { - "procs": 8, # Number of processes - "with_gpus": True # Use GPUs -} -``` - -**What this does**: -- `procs` - Number of parallel processes (usually = number of GPUs) -- `with_gpus` - Whether to use GPUs or CPUs - -**How to modify**: -- For single GPU: `"procs": 1` -- For 4 GPUs: `"procs": 4` -- For CPU training: `"with_gpus": False` (not recommended for LLMs) - -**Important**: Set `procs` to match your available GPUs! - ---- - -### Step 4: Configure Optimizer Settings - -```python -optimizer_config = { - "name": "AdamW", - "lr": 1e-5, # Learning rate - "eps": 1e-8 -} -``` - -**What this does**: -- `name` - Optimizer type (AdamW is recommended for LLMs) -- `lr` - Learning rate (how fast the model learns) -- `eps` - Epsilon for numerical stability - -**How to modify**: -- **Lower learning rate** (e.g., `1e-6`) for fine-tuning -- **Higher learning rate** (e.g., `5e-5`) for pre-training (use with caution) -- Typical range for fine-tuning: `1e-6` to `1e-4` - -**Tips**: -- Start conservative with `1e-5` or `2e-5` -- If loss explodes, reduce learning rate -- If training is too slow, slightly increase learning rate - ---- - -### Step 5: Configure Learning Rate Scheduler - -```python -lr_scheduler_config = { - "warmup_steps": 200 # Number of warmup steps -} -``` - -**What this does**: -- `warmup_steps` - Number of steps to gradually increase learning rate from 0 to `lr` - -**Why warmup**: Prevents training instability at the beginning by starting with a low learning rate. 
- -**How to modify**: -- For short training (< 1000 steps): use 10-50 warmup steps -- For medium training (1000-5000 steps): use 100-200 warmup steps -- For long training (> 5000 steps): use 200-500 warmup steps -- Rule of thumb: ~5-10% of total training steps - ---- - -### Step 6: Configure Training Settings - -```python -training_config = { - "local_batch_size": 1, # Batch size per GPU - "seq_len": 2048, # Sequence length - "max_norm": 1.0, # Gradient clipping - "steps": 1000, # Total training steps - "compile": False, # PyTorch compilation - "dataset": "c4" # Dataset name -} -``` - -**What this does**: -- `local_batch_size` - Number of samples per GPU per step -- `seq_len` - Maximum sequence length (in tokens) -- `max_norm` - Gradient clipping threshold (prevents exploding gradients) -- `steps` - Total number of training steps -- `compile` - Enable PyTorch 2.0 compilation (experimental) -- `dataset` - Dataset identifier - -**How to modify**: - -**For Memory Issues**: -- Reduce `seq_len` (e.g., from 2048 to 1024) -- Reduce `local_batch_size` (e.g., from 2 to 1) -- Both reduce memory usage - -**For Faster Training**: -- Increase `local_batch_size` if you have memory -- Use shorter `seq_len` for tasks that don't need long context - -**For Quick Testing**: -- Set `steps` to 10-100 for quick validation - -**Global batch size** = `local_batch_size` ร— `procs` ร— `data_parallel_shard_degree` - ---- - -### Step 7: Configure Parallelism Settings - -```python -parallelism_config = { - "data_parallel_replicate_degree": 1, - "data_parallel_shard_degree": -1, # -1 = use all GPUs for FSDP - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "context_parallel_degree": 1, - "expert_parallel_degree": 1, - "disable_loss_parallel": False -} -``` - -**What this does**: - -- **Data Parallel Shard Degree (FSDP)**: Splits model parameters across GPUs - - `-1` means use all available GPUs - - `8` means split across 8 GPUs - - Most common strategy for fine-tuning - -- **Tensor Parallel Degree**: Splits individual layers across GPUs - - Use for very large models that don't fit on single GPU even with FSDP - - `1` means no tensor parallelism - -- **Pipeline Parallel Degree**: Splits model into sequential stages - - Use for extremely large models - - `1` means no pipeline parallelism - -- **Context Parallel Degree**: Splits sequence dimension - - For very long sequences - - `1` means no context parallelism - -**Common Configurations**: - -**Single GPU**: -```python -"data_parallel_shard_degree": 1 -``` - -**8 GPUs with FSDP (recommended)**: -```python -"data_parallel_shard_degree": -1 # or 8 -``` - -**Large Model (70B+) with Tensor Parallelism**: -```python -"data_parallel_shard_degree": 4, -"tensor_parallel_degree": 2 -``` - ---- - -### Step 8: Configure Checkpoint Settings - -```python -checkpoint_config = { - "enable": True, - "folder": "/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints", - "initial_load_path": "/tmp/Meta-Llama-3.1-8B-Instruct/", - "initial_load_in_hf": True, - "last_save_in_hf": True, - "interval": 500, # Save every N steps - "async_mode": "disabled" -} -``` - -**What this does**: -- `enable` - Whether to enable checkpointing -- `folder` - Where to save checkpoints -- `initial_load_path` - Where to load initial weights from -- `initial_load_in_hf` - Load weights in HuggingFace format -- `last_save_in_hf` - Save final checkpoint in HuggingFace format -- `interval` - How often to save (in steps) -- `async_mode` - Async saving mode (use "disabled" for simplicity) - -**How to 
modify**: -- **Save more frequently**: Reduce `interval` (e.g., 100) -- **Save less frequently**: Increase `interval` (e.g., 1000) -- **Resume training**: Point `initial_load_path` to your checkpoint folder - -**Important**: Make sure `folder` path exists and has enough disk space! - ---- - -### Step 9: Configure Activation Checkpointing - -```python -activation_checkpoint_config = { - "mode": "selective", - "selective_ac_option": "op" -} -``` - -**What this does**: -- Saves memory by recomputing activations during backward pass instead of storing them -- `mode` - Checkpointing mode ("selective" or "full") -- `selective_ac_option` - Which operations to checkpoint - -**Memory vs Speed Trade-off**: -- **Activation checkpointing ON**: Lower memory, slower training -- **Activation checkpointing OFF**: Higher memory, faster training - -**When to use**: Enable when running out of memory. - ---- - -### Step 10: Configure Communication Settings - -```python -comm_config = { - "trace_buf_size": 0 -} -``` - -**What this does**: -- Configuration for distributed communication (required by TorchTitan) -- Usually you don't need to modify this - ---- - -### Step 11: Combine All Configurations - -```python -complete_config = { - "comm": comm_config, - "model": model_config, - "processes": processes_config, - "optimizer": optimizer_config, - "lr_scheduler": lr_scheduler_config, - "training": training_config, - "parallelism": parallelism_config, - "checkpoint": checkpoint_config, - "activation_checkpoint": activation_checkpoint_config -} - -cfg = OmegaConf.create(complete_config) -``` - -**What this does**: -- Combines all configuration sections into one complete config -- Converts to OmegaConf format (allows dot notation access) - -**Prints**: The complete configuration in YAML format for review - ---- - -### Step 12: Run Training (Simple Way) - -```python -await run_actor(TrainerActor, cfg) -``` - -**What this does**: -- Spawns the trainer actor -- Runs setup (loads data, model, checkpoints) -- Runs training loop -- Cleans up resources -- All in one line! - -**When to use**: When you want fully automatic training with no manual intervention. 
- ---- - -### Alternative: Manual Lifecycle Control - -For more control over the training process: - -#### Create and Spawn the Actor - -```python -spawner = SpawnActor(TrainerActor, cfg) -actor = await spawner.spawn() -``` - -**What this does**: -- Creates a spawner with your config -- Spawns the actor instance (allocates resources, initializes distributed environment) - -#### Setup the Actor - -```python -await spawner.setup() -``` - -**What this does**: -- Loads tokenizer from `hf_assets_path` -- Loads training dataset -- Initializes model -- Loads checkpoint if specified - -**At this point**: You could inspect the actor state before training: -```python -print(f"Current step: {actor.current_step}") -print(f"Device: {actor.device}") -``` - -#### Run Training - -```python -await spawner.run() -``` - -**What this does**: -- Executes the training loop -- Iterates through batches -- Performs forward/backward passes -- Updates weights -- Saves checkpoints at intervals - -#### Cleanup - -```python -await spawner.cleanup() -``` - -**What this does**: -- Closes checkpointer -- Closes logger -- Stops the actor mesh -- Frees resources - -**When to use manual control**: -- When you want to inspect state between phases -- When you want to modify configuration between setup and run -- For debugging purposes - ---- - -## Utility Functions Explained - -The `utils.py` module provides reusable helper functions: - -### 1. `setup_tokenizer()` - -```python -def setup_tokenizer( - hf_assets_path: str, - tokenizer_filename: str = "tokenizer.json", - tokenizer_config_filename: str = "tokenizer_config.json", - generation_config_filename: str = "generation_config.json", -) -> HuggingFaceModelTokenizer -``` - -**What it does**: -- Loads a HuggingFace tokenizer from the model assets directory -- Initializes tokenizer with config and generation settings - -**Parameters**: -- `hf_assets_path` - Path to directory containing tokenizer files -- Other parameters are filenames (usually don't need to change) - -**Returns**: Initialized `HuggingFaceModelTokenizer` object - -**Example**: -```python -tokenizer = setup_tokenizer("/tmp/Meta-Llama-3.1-8B-Instruct") -``` - -**When to use**: If you need to use the tokenizer independently (e.g., for preprocessing data) - ---- - -### 2. 
`setup_sft_dataloader()` - -```python -def setup_sft_dataloader( - tokenizer: HuggingFaceModelTokenizer, - dataset_path: str, - dataset_split: str, - target_tokens_per_pack: int, - batch_size: int, - device: torch.device, - padding_idx: int = 0, - message_transform: Optional[Any] = None, -) -> StatefulDataLoader -``` - -**What it does**: -- Creates a dataloader for supervised fine-tuning -- Handles data loading, tokenization, and packing -- Returns a StatefulDataLoader (can save/restore state for checkpointing) - -**Parameters**: -- `tokenizer` - Tokenizer to use for text processing -- `dataset_path` - HuggingFace dataset name (e.g., "yahma/alpaca-cleaned") -- `dataset_split` - Which split to use ("train", "validation", "test") -- `target_tokens_per_pack` - Sequence length (same as `seq_len` in config) -- `batch_size` - Batch size (same as `local_batch_size` in config) -- `device` - Which device to move tensors to -- `padding_idx` - Token ID for padding (usually 0) -- `message_transform` - Transform to convert dataset format (default: AlpacaToMessages) - -**Returns**: Configured `StatefulDataLoader` - -**Example**: -```python -dataloader = setup_sft_dataloader( - tokenizer=tokenizer, - dataset_path="yahma/alpaca-cleaned", - dataset_split="train", - target_tokens_per_pack=2048, - batch_size=4, - device=torch.device("cuda"), -) -``` - -**When to use**: If you want to create a custom dataloader outside of TrainerActor - ---- - -### 3. `create_context_parallel_context()` - -```python -def create_context_parallel_context( - parallel_dims: ParallelDims, - inputs: torch.Tensor, - labels: torch.Tensor, - model_parts: list, - rotate_method: str, -) -``` - -**What it does**: -- Creates context for context parallelism (splits sequence across GPUs) -- Returns None if context parallelism is disabled - -**Parameters**: -- `parallel_dims` - Parallel dimensions configuration -- `inputs` - Input tensor -- `labels` - Label tensor -- `model_parts` - List of model parts -- `rotate_method` - Rotation method for context parallel - -**Returns**: Context parallel context or None - -**When to use**: Internally used by TrainerActor. You rarely need to call this directly. - ---- - -### 4. `move_batch_to_device()` - -```python -def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any] -``` - -**What it does**: -- Moves all tensors in a batch dictionary to the specified device -- Leaves non-tensor values unchanged - -**Parameters**: -- `batch` - Dictionary containing batch data -- `device` - Target device (e.g., `torch.device("cuda")`) - -**Returns**: Batch with tensors moved to device - -**Example**: -```python -batch = {"tokens": tensor, "labels": tensor, "metadata": "some_string"} -batch = move_batch_to_device(batch, torch.device("cuda")) -``` - -**When to use**: Useful when manually processing batches - ---- - -### 5. `log_training_step()` - -```python -def log_training_step( - step: int, - total_steps: int, - loss: torch.Tensor, - logger: logging.Logger, -) -``` - -**What it does**: -- Logs training progress in a formatted way -- Shows current step, total steps, and loss value - -**Parameters**: -- `step` - Current training step -- `total_steps` - Total number of training steps -- `loss` - Current loss tensor -- `logger` - Logger instance - -**Example output**: -``` -Step 100/1000 | Loss: 2.3456 -``` - -**When to use**: Internally used by TrainerActor. You can use it for custom logging. - ---- - -## How to Run - -### Prerequisites - -1. 
**Download Model**: -```bash -export HF_HUB_DISABLE_XET=1 -forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct -``` - -2. **Check GPU Availability**: -```bash -nvidia-smi # Should show your GPUs -``` - -### Running the Notebook - -#### Option 1: Using Jupyter Notebook - -1. **Start Jupyter**: -```bash -cd /home/hosseinkh/TorchForge/forge -jupyter notebook -``` - -2. **Open the notebook**: - - Navigate to `apps/sft_v2/interactive_config_notebook.ipynb` - - Click to open - -3. **Run cells sequentially**: - - Click on first cell, press `Shift + Enter` to run - - Continue through all cells - - Modify configuration cells as needed - - Run Step 12 to start training - -#### Option 2: Using VS Code - -1. **Open notebook in VS Code**: - - File โ†’ Open โ†’ `interactive_config_notebook.ipynb` - -2. **Select Python kernel**: - - Click "Select Kernel" in top right - - Choose your Python environment - -3. **Run cells**: - - Click "Run Cell" button on each cell - - Or press `Shift + Enter` - -#### Option 3: Using Command Line (with simplified entry point) - -```bash -cd /home/hosseinkh/TorchForge/forge -python -m apps.sft_v2.notebook_main --config apps/sft_v2/llama3_8b.yaml -``` - -Note: This uses a YAML file, but you can use the notebook for interactive config. - ---- - -## Common Scenarios - -### Scenario 1: Quick Test (1 GPU, 100 steps) - -```python -# Modify these cells: -processes_config = {"procs": 1, "with_gpus": True} -training_config = { - "local_batch_size": 1, - "seq_len": 1024, - "steps": 100, # Just 100 steps - ... -} -``` - -**Expected time**: 5-10 minutes on A100 - -### Scenario 2: Full Training (8 GPUs, 5000 steps) - -```python -processes_config = {"procs": 8, "with_gpus": True} -training_config = { - "local_batch_size": 2, - "seq_len": 2048, - "steps": 5000, - ... -} -parallelism_config = { - "data_parallel_shard_degree": -1, # Use all 8 GPUs - ... -} -``` - -**Expected time**: Several hours depending on hardware - -### Scenario 3: Memory-Constrained Training - -```python -training_config = { - "local_batch_size": 1, # Small batch - "seq_len": 1024, # Shorter sequence - ... -} -activation_checkpoint_config = { - "mode": "selective", # Enable AC for memory savings - ... -} -``` - -**Use when**: Running out of GPU memory - -### Scenario 4: Resume from Checkpoint - -```python -checkpoint_config = { - "enable": True, - "folder": "/path/to/previous/checkpoints", - "initial_load_path": "/path/to/previous/checkpoints/step_1000", - "interval": 500, - ... -} -``` - -**Use when**: Continuing training from a saved checkpoint - ---- - -## Troubleshooting - -### Problem: "CUDA out of memory" - -**Solutions**: -1. Reduce `seq_len` (e.g., from 2048 to 1024) -2. Reduce `local_batch_size` (e.g., from 2 to 1) -3. Enable activation checkpointing -4. Use more GPUs with FSDP - -### Problem: "Loss is NaN or exploding" - -**Solutions**: -1. Reduce learning rate (e.g., from `1e-5` to `1e-6`) -2. Increase gradient clipping (`max_norm` from 1.0 to 0.5) -3. Increase warmup steps - -### Problem: "Training is too slow" - -**Solutions**: -1. Increase `local_batch_size` if memory allows -2. Use more GPUs -3. Reduce `seq_len` if your task doesn't need long context -4. Enable compilation (`compile: True`) - -### Problem: "Cannot find tokenizer files" - -**Solutions**: -1. Check `hf_assets_path` is correct -2. Ensure path contains `tokenizer.json` and `tokenizer_config.json` -3. 
Re-download model if files are missing - -### Problem: "Actor spawning fails" - -**Solutions**: -1. Check you have enough GPUs for `procs` -2. Verify CUDA is available (`torch.cuda.is_available()`) -3. Check no other processes are using GPUs - ---- - -## Summary - -**Key Takeaways**: - -1. **Interactive Configuration**: Define all settings in notebook cells, no YAML needed -2. **Step-by-Step**: Configure model, processes, optimizer, training, parallelism, checkpoints separately -3. **Two Ways to Run**: Simple (`run_actor()`) or manual (lifecycle control) -4. **Utility Functions**: Helper functions for tokenization, data loading, device management -5. **Templates Provided**: Quick test, multi-GPU, memory-efficient configs ready to use -6. **Flexible**: Easy to modify parameters and experiment - -**Next Steps**: -1. Download your model -2. Open the notebook -3. Modify configuration cells for your needs -4. Run Step 12 to start training -5. Monitor logs for progress - -Happy Training! ๐Ÿš€ diff --git a/apps/sft_v2/actor.py b/apps/sft_v2/actor.py deleted file mode 100644 index 8607a39c4..000000000 --- a/apps/sft_v2/actor.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Abstract Actor class for training/inference actors in Forge. - -This provides a base class that can be extended for different types of actors -(e.g., Trainer, Evaluator, Inferencer, etc.) -""" - -import logging -import math -import os -from abc import ABC, abstractmethod -from typing import Any, Optional - -import torch -from forge.controller import ForgeActor -from monarch.actor import current_rank, current_size -from omegaconf import DictConfig, OmegaConf -from torch import nn -from torchtitan.components.loss import LossFunction -from torchtitan.components.lr_scheduler import LRSchedulersContainer -from torchtitan.components.optimizer import OptimizersContainer -from torchtitan.distributed import ParallelDims -from torchtitan.experiments.forge.engine import ForgeEngine -from torchtitan.experiments.forge.job_config import ForgeJobConfig - -Checkpointer = Any -Dataloader = Any -MetricLogger = Any -Profiler = Any -Tokenizer = Any - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class BaseForgeActor(ForgeActor, ForgeEngine, ABC): - """ - Abstract base class for Forge actors. - - This class handles common initialization, distributed setup, and provides - abstract methods that must be implemented by concrete actor classes. - """ - - job_config: ForgeJobConfig - parallel_dims: ParallelDims - model: list[nn.Module] - loss_fn: Optional[LossFunction] - optimizer: Optional[OptimizersContainer] - lr_scheduler: Optional[LRSchedulersContainer] - checkpointer: Optional[Checkpointer] - tokenizer: Optional[Tokenizer] - metric_logger: Optional[MetricLogger] - profiler: Optional[Profiler] - device: torch.device - - def __init__(self, config: DictConfig): - """ - Initialize the base actor with configuration. 
- - Args: - config: Configuration dictionary containing job settings - """ - job_config = ForgeJobConfig().to_dict() - job_config = OmegaConf.merge(job_config, config) - - self.current_step = 0 - self.metric_logger = None - self.gradient_accumulation_steps = 1 - self._rank = current_rank().rank - self._size = math.prod(current_size().values()) - - self._init_dist() - super().__init__(job_config) - - def _init_dist(self): - """ - Initialize torch distributed environment. - - Sets up environment variables required for distributed training - in the Monarch actor framework. - """ - env = { - "RANK": str(self._rank), - "LOCAL_RANK": str(self._rank), - "LOCAL_WORLD_SIZE": str(self._size), - "GROUP_RANK": str(self._size), - "GROUP_WORLD_SIZE": str(self._size), - "ROLE_RANK": str(self._rank), - "ROLE_WORLD_SIZE": str(self._size), - "ROLE_NAME": "rank", - "WORLD_SIZE": str(self._size), - "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", - } - os.environ.update(env) - logger.info(f"Initialized distributed environment: {env}") - - @abstractmethod - async def setup(self): - """ - Setup the actor (load data, checkpoint, etc.). - - This method must be implemented by concrete actor classes. - """ - pass - - @abstractmethod - async def run(self): - """ - Main execution logic for the actor. - - This method must be implemented by concrete actor classes. - """ - pass - - @abstractmethod - async def cleanup(self): - """ - Cleanup resources (close checkpointer, logger, etc.). - - This method must be implemented by concrete actor classes. - """ - pass - - @abstractmethod - def __repr__(self) -> str: - """String representation of the actor.""" - pass diff --git a/apps/sft_v2/interactive_config_notebook.ipynb b/apps/sft_v2/interactive_config_notebook.ipynb deleted file mode 100644 index 624f6a08a..000000000 --- a/apps/sft_v2/interactive_config_notebook.ipynb +++ /dev/null @@ -1,629 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SFT Training - Interactive Configuration Notebook\n", - "\n", - "This notebook allows you to configure and run SFT training **without any YAML files**!\n", - "\n", - "## Benefits\n", - "\n", - "โœ… No external YAML files needed \n", - "โœ… Interactive configuration in separate cells \n", - "โœ… Easy to modify and experiment \n", - "โœ… All configuration visible in notebook \n", - "โœ… Quick templates for common scenarios" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: Import Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import asyncio\n", - "import logging\n", - "from omegaconf import OmegaConf, DictConfig\n", - "\n", - "from forge.apps.sft_v2.trainer_actor import TrainerActor\n", - "from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor\n", - "\n", - "logging.basicConfig(\n", - " level=logging.INFO,\n", - " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Configure Model Settings\n", - "\n", - "Define your model configuration. 
**Modify these values as needed!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_config = {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - "}\n", - "\n", - "print(\"Model Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(model_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Configure Process Settings\n", - "\n", - "Define how many processes to use and whether to use GPUs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processes_config = {\n", - " \"procs\": 8, # Number of processes\n", - " \"with_gpus\": True # Use GPUs\n", - "}\n", - "\n", - "print(\"Process Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(processes_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Configure Optimizer Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer_config = {\n", - " \"name\": \"AdamW\",\n", - " \"lr\": 1e-5, # Learning rate\n", - " \"eps\": 1e-8\n", - "}\n", - "\n", - "print(\"Optimizer Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(optimizer_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5: Configure Learning Rate Scheduler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lr_scheduler_config = {\n", - " \"warmup_steps\": 200 # Number of warmup steps\n", - "}\n", - "\n", - "print(\"LR Scheduler Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(lr_scheduler_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 6: Configure Training Settings\n", - "\n", - "**Key parameters to adjust for your experiment:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_config = {\n", - " \"local_batch_size\": 1, # Batch size per GPU\n", - " \"seq_len\": 2048, # Sequence length\n", - " \"max_norm\": 1.0, # Gradient clipping\n", - " \"steps\": 1000, # Total training steps\n", - " \"compile\": False, # PyTorch compilation\n", - " \"dataset\": \"c4\" # Dataset name\n", - "}\n", - "\n", - "print(\"Training Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(training_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 7: Configure Parallelism Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parallelism_config = {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": -1, # -1 means use all available GPUs for FSDP\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - "}\n", - "\n", - "print(\"Parallelism Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(parallelism_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 8: Configure Checkpoint Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"checkpoint_config = {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 500, # Save every N steps\n", - " \"async_mode\": \"disabled\"\n", - "}\n", - "\n", - "print(\"Checkpoint Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(checkpoint_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 9: Configure Activation Checkpointing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "activation_checkpoint_config = {\n", - " \"mode\": \"selective\",\n", - " \"selective_ac_option\": \"op\"\n", - "}\n", - "\n", - "print(\"Activation Checkpoint Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(activation_checkpoint_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 10: Configure Communication Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "comm_config = {\n", - " \"trace_buf_size\": 0\n", - "}\n", - "\n", - "print(\"Communication Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(comm_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 11: Combine All Configurations\n", - "\n", - "Now let's merge everything into a complete configuration!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Combine all configs\n", - "complete_config = {\n", - " \"comm\": comm_config,\n", - " \"model\": model_config,\n", - " \"processes\": processes_config,\n", - " \"optimizer\": optimizer_config,\n", - " \"lr_scheduler\": lr_scheduler_config,\n", - " \"training\": training_config,\n", - " \"parallelism\": parallelism_config,\n", - " \"checkpoint\": checkpoint_config,\n", - " \"activation_checkpoint\": activation_checkpoint_config\n", - "}\n", - "\n", - "# Create OmegaConf DictConfig\n", - "cfg = OmegaConf.create(complete_config)\n", - "\n", - "print(\"=\" * 80)\n", - "print(\"COMPLETE CONFIGURATION\")\n", - "print(\"=\" * 80)\n", - "print(OmegaConf.to_yaml(cfg))\n", - "print(\"=\" * 80)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 12: Run Training (Simple Way)\n", - "\n", - "The simplest way - automatic lifecycle management!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run training with automatic lifecycle management\n", - "await run_actor(TrainerActor, cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Alternative: Manual Lifecycle Control\n", - "\n", - "For more control, manage each phase separately.\n", - "\n", - "### Create and Spawn the Actor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create the spawner\n", - "spawner = SpawnActor(TrainerActor, cfg)\n", - "\n", - "# Spawn the actor\n", - "actor = await spawner.spawn()\n", - "print(f\"โœ“ Actor spawned: {actor}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup the Actor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Setup (load data, checkpoints, etc.)\n", - "await spawner.setup()\n", - "print(\"โœ“ Actor setup complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run training\n", - "await spawner.run()\n", - "print(\"โœ“ Training complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Cleanup resources\n", - "await spawner.cleanup()\n", - "print(\"โœ“ Cleanup complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# Quick Configuration Templates\n", - "\n", - "Here are ready-to-use templates for common scenarios!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Template 1: Quick Test (Single GPU, Small Steps)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "quick_test_config = OmegaConf.create({\n", - " \"comm\": {\"trace_buf_size\": 0},\n", - " \"model\": {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - " },\n", - " \"processes\": {\"procs\": 1, \"with_gpus\": True},\n", - " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", - " \"lr_scheduler\": {\"warmup_steps\": 10},\n", - " \"training\": {\n", - " \"local_batch_size\": 1,\n", - " \"seq_len\": 1024,\n", - " \"max_norm\": 1.0,\n", - " \"steps\": 100, # Just 100 steps for quick testing\n", - " \"compile\": False,\n", - " \"dataset\": \"c4\"\n", - " },\n", - " \"parallelism\": {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": 1,\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - " },\n", - " \"checkpoint\": {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/quick_test_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 50,\n", - " \"async_mode\": \"disabled\"\n", - " },\n", - " \"activation_checkpoint\": {\n", - " \"mode\": \"selective\",\n", - " \"selective_ac_option\": \"op\"\n", - " }\n", - "})\n", - "\n", - "print(\"Quick Test Configuration:\")\n", - "print(OmegaConf.to_yaml(quick_test_config))\n", - "\n", - "# To use: await run_actor(TrainerActor, quick_test_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Template 2: Multi-GPU Training (8 GPUs with FSDP)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "multi_gpu_config = OmegaConf.create({\n", - " \"comm\": {\"trace_buf_size\": 0},\n", - " \"model\": {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - " },\n", - " \"processes\": {\"procs\": 8, \"with_gpus\": True},\n", - " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 2e-5, \"eps\": 1e-8},\n", - " \"lr_scheduler\": {\"warmup_steps\": 200},\n", - " \"training\": {\n", - " \"local_batch_size\": 2,\n", - " \"seq_len\": 2048,\n", - " \"max_norm\": 1.0,\n", - " \"steps\": 5000,\n", - " \"compile\": False,\n", - " \"dataset\": \"c4\"\n", - " },\n", - " \"parallelism\": {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": 8, # FSDP across 8 GPUs\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - " },\n", - " \"checkpoint\": {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/multi_gpu_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 500,\n", - " \"async_mode\": \"disabled\"\n", - " },\n", - " \"activation_checkpoint\": {\n", - " \"mode\": \"selective\",\n", - " \"selective_ac_option\": \"op\"\n", - " }\n", - "})\n", - "\n", - 
"print(\"Multi-GPU Configuration:\")\n", - "print(OmegaConf.to_yaml(multi_gpu_config))\n", - "\n", - "# To use: await run_actor(TrainerActor, multi_gpu_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Template 3: Memory-Efficient Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "memory_efficient_config = OmegaConf.create({\n", - " \"comm\": {\"trace_buf_size\": 0},\n", - " \"model\": {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - " },\n", - " \"processes\": {\"procs\": 4, \"with_gpus\": True},\n", - " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", - " \"lr_scheduler\": {\"warmup_steps\": 150},\n", - " \"training\": {\n", - " \"local_batch_size\": 1, # Small batch size\n", - " \"seq_len\": 1024, # Shorter sequence\n", - " \"max_norm\": 1.0,\n", - " \"steps\": 2000,\n", - " \"compile\": False,\n", - " \"dataset\": \"c4\"\n", - " },\n", - " \"parallelism\": {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": 4,\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - " },\n", - " \"checkpoint\": {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/memory_efficient_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 400,\n", - " \"async_mode\": \"disabled\"\n", - " },\n", - " \"activation_checkpoint\": {\n", - " \"mode\": \"selective\", # Saves memory\n", - " \"selective_ac_option\": \"op\"\n", - " }\n", - "})\n", - "\n", - "print(\"Memory-Efficient Configuration:\")\n", - "print(OmegaConf.to_yaml(memory_efficient_config))\n", - "\n", - "# To use: await run_actor(TrainerActor, memory_efficient_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# Tips & Tricks\n", - "\n", - "## Memory Optimization\n", - "- โฌ‡๏ธ Reduce `seq_len` if running out of memory\n", - "- โฌ‡๏ธ Reduce `local_batch_size` if running out of memory\n", - "- โœ… Enable `activation_checkpoint` for memory savings\n", - "\n", - "## Training Speed\n", - "- โฌ†๏ธ Increase `local_batch_size` for faster training (if memory allows)\n", - "- ๐Ÿš€ Use multiple GPUs with FSDP (`data_parallel_shard_degree > 1`)\n", - "- โšก Enable `compile: true` for PyTorch compilation (experimental)\n", - "\n", - "## Debugging\n", - "- ๐Ÿงช Start with small `steps` (e.g., 10-100) to test quickly\n", - "- ๐Ÿ” Use single GPU first (`procs: 1`)\n", - "- ๐Ÿ“Š Monitor loss values in logs\n", - "\n", - "## Checkpoint Management\n", - "- ๐Ÿ’พ Set `interval` based on how often you want to save\n", - "- ๐Ÿ“ Ensure `folder` path exists and has enough space\n", - "- ๐Ÿ”„ Use `initial_load_path` to resume from checkpoints" - ] - } - ], - "metadata": { - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/apps/sft_v2/spawn_actor.py b/apps/sft_v2/spawn_actor.py deleted file mode 100644 index eb9695c76..000000000 --- a/apps/sft_v2/spawn_actor.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -SpawnActor - Orchestrates the spawning and lifecycle management of actors. - -This module provides a high-level interface for creating, setting up, running, -and cleaning up different types of actors (e.g., Trainer, Evaluator, etc.) -""" - -import logging -from typing import Any, Type - -from forge.apps.sft_v2.actor import BaseForgeActor -from omegaconf import DictConfig - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class SpawnActor: - """ - Orchestrator for spawning and managing actor lifecycles. - - This class handles the creation, setup, execution, and cleanup of actors - in a standardized way. - """ - - def __init__(self, actor_class: Type[BaseForgeActor], config: DictConfig): - """ - Initialize the spawn actor orchestrator. - - Args: - actor_class: The actor class to instantiate (must inherit from BaseForgeActor) - config: Configuration dictionary for the actor - """ - self.actor_class = actor_class - self.config = config - self.actor = None - - if not issubclass(actor_class, BaseForgeActor): - raise TypeError( - f"actor_class must be a subclass of BaseForgeActor, got {actor_class}" - ) - - async def spawn(self) -> Any: - """ - Spawn the actor instance with the given configuration. - - Returns: - The spawned actor instance - """ - logger.info(f"Spawning {self.actor_class.__name__}...") - - process_cfg = self.config.pop("processes", {}) - - self.actor = await self.actor_class.options(**process_cfg).as_actor(self.config) - - logger.info(f"{self.actor_class.__name__} spawned successfully.") - return self.actor - - async def setup(self): - """ - Setup the spawned actor (load data, checkpoints, etc.). - """ - if self.actor is None: - raise RuntimeError( - "Actor must be spawned before setup. Call spawn() first." - ) - - logger.info(f"Setting up {self.actor_class.__name__}...") - await self.actor.setup.call() - logger.info(f"{self.actor_class.__name__} setup complete.") - - async def run(self): - """ - Run the main execution logic of the actor. - """ - if self.actor is None: - raise RuntimeError( - "Actor must be spawned before running. Call spawn() first." - ) - - logger.info(f"Running {self.actor_class.__name__}...") - await self.actor.run.call() - logger.info(f"{self.actor_class.__name__} execution complete.") - - async def cleanup(self): - """ - Cleanup the actor resources and stop the mesh. - """ - if self.actor is None: - raise RuntimeError( - "Actor must be spawned before cleanup. Call spawn() first." - ) - - logger.info(f"Cleaning up {self.actor_class.__name__}...") - await self.actor.cleanup.call() - - if hasattr(self.actor, "mesh"): - await self.actor.mesh.stop() - - logger.info(f"{self.actor_class.__name__} cleanup complete.") - - async def run_full_lifecycle(self): - """ - Execute the complete actor lifecycle: spawn -> setup -> run -> cleanup. - - This is a convenience method that runs all phases in sequence. - """ - logger.info(f"Starting full lifecycle for {self.actor_class.__name__}...") - - try: - await self.spawn() - await self.setup() - await self.run() - finally: - if self.actor is not None: - await self.cleanup() - - logger.info(f"Full lifecycle complete for {self.actor_class.__name__}.") - - -async def run_actor( - actor_class: Type[BaseForgeActor], - config: DictConfig, -) -> None: - """ - Convenience function to run an actor with full lifecycle management. 
- - Args: - actor_class: The actor class to instantiate - config: Configuration dictionary for the actor - """ - spawner = SpawnActor(actor_class, config) - await spawner.run_full_lifecycle() diff --git a/apps/sft_v2/trainer_actor.py b/apps/sft_v2/trainer_actor.py deleted file mode 100644 index 10c5e9b38..000000000 --- a/apps/sft_v2/trainer_actor.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Trainer actor implementation for SFT training. - -This is a concrete implementation of BaseForgeActor for supervised fine-tuning. -""" - -import logging - -import torch -import torchtitan.experiments.forge.train_spec as forge_train_spec -from forge.apps.sft_v2.actor import BaseForgeActor -from forge.apps.sft_v2.utils import ( - create_context_parallel_context, - log_training_step, - move_batch_to_device, - setup_sft_dataloader, - setup_tokenizer, -) -from monarch.actor import endpoint -from omegaconf import DictConfig - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class TrainerActor(BaseForgeActor): - """ - Concrete trainer actor for supervised fine-tuning. - - Handles training loop, forward/backward passes, and checkpoint management. - """ - - train_spec: forge_train_spec.ForgeTrainSpec - train_dataloader: any - num_training_steps: int - - def __init__(self, config: DictConfig): - """ - Initialize the trainer actor. - - Args: - config: Configuration dictionary containing training settings - """ - super().__init__(config) - self.num_training_steps = self.job_config.training.steps - - @endpoint - async def setup(self): - """ - Setup the trainer (load data, checkpoint, etc.). - """ - logger.info("Setting up trainer actor...") - - self.tokenizer = setup_tokenizer( - hf_assets_path=self.job_config.model.hf_assets_path - ) - - self.train_dataloader = setup_sft_dataloader( - tokenizer=self.tokenizer, - dataset_path="yahma/alpaca-cleaned", - dataset_split="train", - target_tokens_per_pack=self.job_config.training.seq_len, - batch_size=self.job_config.training.local_batch_size, - device=self.device, - ) - - if self.checkpointer: - logger.info("Loading checkpoint...") - self.checkpointer.load(step=self.current_step) - - logger.info("Trainer setup complete.") - - def forward_backward( - self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor - ) -> torch.Tensor: - """ - Perform forward and backward pass. 
- - Args: - input_dict: Dictionary containing input tokens - labels: Ground truth labels - - Returns: - Computed loss value - """ - model_parts = self.model_parts - parallel_dims = self.parallel_dims - inputs = input_dict["tokens"] - - optional_context_parallel_ctx = create_context_parallel_context( - parallel_dims=parallel_dims, - inputs=inputs, - labels=labels, - model_parts=model_parts, - rotate_method=self.job_config.parallelism.context_parallel_rotate_method, - ) - - if parallel_dims.pp_enabled: - with self.train_context(optional_context_parallel_ctx): - targets, losses = ( - (labels, []) if self.pp_has_last_stage else (None, None) - ) - if self.pp_has_first_stage: - self.pp_schedule.step( - inputs, target=targets, losses=losses, input_batch=inputs - ) - else: - self.pp_schedule.step( - target=targets, losses=losses, input_batch=inputs - ) - - loss = ( - torch.mean(torch.stack(losses)).to(self.device) - if self.pp_has_last_stage - else torch.tensor([-1.0], device=self.device) - ) - else: - with self.train_context(optional_context_parallel_ctx): - assert len(model_parts) == 1 - with self.maybe_enable_amp: - pred = model_parts[0](inputs) - loss = self.loss_fn(pred, labels) - del pred - loss.backward() - - return loss - - def train_step(self, batch: dict[str, torch.Tensor]) -> None: - """ - Execute a single training step. - - Args: - batch: Dictionary containing batch data (tokens, labels, etc.) - """ - labels = batch.pop("labels") - loss = self.forward_backward(batch, labels) - - log_training_step(self.current_step, self.num_training_steps, loss, logger) - - self.optimizers.step() - self.lr_schedulers.step() - - @endpoint - async def run(self) -> None: - """ - Main training loop. - """ - logger.info("Starting training loop...") - - dataloader = iter(self.train_dataloader) - self.optimizers.zero_grad() - - while self.current_step < self.num_training_steps: - batch = next(dataloader) - batch = move_batch_to_device(batch, self.device) - - self.train_step(batch) - self.current_step += 1 - - if self.checkpointer: - self.checkpointer.save( - curr_step=self.current_step, - last_step=self.current_step == self.num_training_steps, - ) - - logger.info("Training complete!") - - @endpoint - async def cleanup(self) -> None: - """ - Cleanup resources (close checkpointer, logger, etc.). - """ - logger.info("Cleaning up trainer actor...") - - if self.checkpointer: - self.checkpointer.close() - if self.metric_logger: - self.metric_logger.close() - - logger.info("Cleanup complete.") - - def __repr__(self) -> str: - return "TrainerActor" diff --git a/apps/sft_v2/utils.py b/apps/sft_v2/utils.py deleted file mode 100644 index 6d0219805..000000000 --- a/apps/sft_v2/utils.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Utility functions for SFT training actors. - -These utilities handle data loading, model setup, and common operations. 
-""" - -import logging -import os -from functools import partial -from typing import Any, Optional - -import torch -from forge.data.collate import collate_packed -from forge.data.datasets.packed import PackedDataset, TextPacker -from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset -from forge.data.tokenizer import HuggingFaceModelTokenizer -from torchdata.stateful_dataloader import StatefulDataLoader -from torchtitan.distributed import ParallelDims, utils as dist_utils - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -def setup_tokenizer( - hf_assets_path: str, - tokenizer_filename: str = "tokenizer.json", - tokenizer_config_filename: str = "tokenizer_config.json", - generation_config_filename: str = "generation_config.json", -) -> HuggingFaceModelTokenizer: - """ - Setup HuggingFace tokenizer from model assets. - - Args: - hf_assets_path: Path to the directory containing tokenizer files - tokenizer_filename: Name of the tokenizer JSON file - tokenizer_config_filename: Name of the tokenizer config JSON file - generation_config_filename: Name of the generation config JSON file - - Returns: - Initialized HuggingFaceModelTokenizer - """ - tokenizer_json_path = os.path.join(hf_assets_path, tokenizer_filename) - tokenizer_config_path = os.path.join(hf_assets_path, tokenizer_config_filename) - generation_config_path = os.path.join(hf_assets_path, generation_config_filename) - - logger.info(f"Loading tokenizer from: {tokenizer_json_path}") - - tokenizer = HuggingFaceModelTokenizer( - tokenizer_json_path=tokenizer_json_path, - tokenizer_config_json_path=tokenizer_config_path, - generation_config_path=generation_config_path, - ) - - return tokenizer - - -def setup_sft_dataloader( - tokenizer: HuggingFaceModelTokenizer, - dataset_path: str, - dataset_split: str, - target_tokens_per_pack: int, - batch_size: int, - device: torch.device, - padding_idx: int = 0, - message_transform: Optional[Any] = None, -) -> StatefulDataLoader: - """ - Setup dataloader for SFT training. 
- - Args: - tokenizer: Tokenizer to use for processing text - dataset_path: Path or name of the dataset (e.g., "yahma/alpaca-cleaned") - dataset_split: Dataset split to use (e.g., "train", "validation") - target_tokens_per_pack: Target sequence length for packing - batch_size: Batch size for training - device: Device to move tensors to - padding_idx: Padding token index - message_transform: Transform to convert dataset format to messages - - Returns: - Configured StatefulDataLoader - """ - if message_transform is None: - message_transform = AlpacaToMessages() - - logger.info(f"Loading SFT dataset from: {dataset_path}, split: {dataset_split}") - - dataset = sft_iterable_dataset( - model_transform=tokenizer, - message_transform=message_transform, - path=dataset_path, - split=dataset_split, - ) - - packer = TextPacker(padding_idx=padding_idx) - dataset = PackedDataset( - dataset=dataset, - packer=packer, - target_tokens_per_pack=target_tokens_per_pack, - ) - - dataloader = StatefulDataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=partial( - collate_packed, mask_fn=packer.create_block_mask, device=device - ), - ) - - logger.info( - f"Created dataloader with batch_size={batch_size}, target_tokens={target_tokens_per_pack}" - ) - - return dataloader - - -def create_context_parallel_context( - parallel_dims: ParallelDims, - inputs: torch.Tensor, - labels: torch.Tensor, - model_parts: list, - rotate_method: str, -): - """ - Create context parallel context for distributed training. - - Args: - parallel_dims: Parallel dimensions configuration - inputs: Input tensor - labels: Label tensor - model_parts: List of model parts - rotate_method: Context parallel rotation method - - Returns: - Context parallel context or None if CP is not enabled - """ - if not parallel_dims.cp_enabled: - return None - - return dist_utils.create_context_parallel_ctx( - cp_mesh=parallel_dims.world_mesh["cp"], - cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], - cp_seq_dims=[1, 1] + [0 for _ in model_parts], - cp_no_restore_buffers={inputs, labels}, - cp_rotate_method=rotate_method, - ) - - -def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any]: - """ - Move batch tensors to the specified device. - - Args: - batch: Dictionary containing batch data - device: Target device - - Returns: - Batch with tensors moved to device - """ - for key, value in batch.items(): - if isinstance(value, torch.Tensor): - batch[key] = value.to(device) - return batch - - -def log_training_step( - step: int, - total_steps: int, - loss: torch.Tensor, - logger: logging.Logger, -): - """ - Log training step information. 
- - Args: - step: Current training step - total_steps: Total number of training steps - loss: Current loss value - logger: Logger instance - """ - logger.info(f"Step {step}/{total_steps} | Loss: {loss.item():.4f}") From 53371c63fd61b7cd20989e13afb985bc345c8aae Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Thu, 16 Oct 2025 14:40:50 -0700 Subject: [PATCH 4/7] Implement Epoch-Based Evaluation with Non-Blocking All-Reduce --- apps/sft/main.py | 122 +++++++++-- apps/sft/test_evaluate.py | 437 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 542 insertions(+), 17 deletions(-) create mode 100644 apps/sft/test_evaluate.py diff --git a/apps/sft/main.py b/apps/sft/main.py index 97ed4125e..7d6cfc665 100644 --- a/apps/sft/main.py +++ b/apps/sft/main.py @@ -279,8 +279,29 @@ def train_step(self, batch) -> None: self.optimizers.step() self.lr_schedulers.step() + def _extract_epoch_from_batch(self, batch: dict) -> int | None: + """Extract epoch number from batch metrics.""" + if "metrics" not in batch: + return None + + for metric in batch["metrics"]: + if hasattr(metric, "metric_name") and metric.metric_name == "num_epochs": + return metric.value + return None + async def evaluate(self) -> dict[str, float]: - """Run evaluation on validation set (internal method, not an endpoint).""" + """Run evaluation on validation set for one complete epoch. + + Uses prefetch + non-blocking all_reduce pattern to detect epoch completion + across all ranks without blocking on every batch. + + Pattern: + - Iteration N: Start async all_reduce on next batch's epoch (non-blocking) + - Process current batch while all_reduce completes in background + - Iteration N+1: Check result from previous all_reduce (should be done) + + This overlaps communication with computation for better performance. 
+ """ logger.info("=" * 50) logger.info("STARTING EVALUATION ") logger.info("=" * 50) @@ -292,30 +313,97 @@ async def evaluate(self) -> dict[str, float]: val_dataloader = iter(self.val_dataloader) total_loss = 0.0 num_batches = 0 + starting_epoch = None + + # Prefetch first batch + try: + next_batch = next(val_dataloader) + except StopIteration: + logger.warning("Validation dataloader is empty") + return {"val_loss": 0.0, "val_batches": 0} + + next_should_break = False + pending_work = None # Handle for async all_reduce + epoch_tensor = None # Tensor for all_reduce result with torch.no_grad(): - for step in range(self.eval_steps): - try: - batch = next(val_dataloader) + while True: + # Check result from PREVIOUS iteration's async all_reduce + if pending_work is not None: + pending_work.wait() # Should be complete (or very fast) since we did compute + if epoch_tensor is not None: + next_should_break = epoch_tensor.item() > 0 + pending_work = None + + # Check if we should break (based on previous iteration's check) + if next_should_break: + logger.info( + "Epoch completed across all ranks - stopping evaluation" + ) + break - # Move tensors to device - for k, v in batch.items(): - if isinstance(v, torch.Tensor): - batch[k] = v.to(self.device) + # Check optional cap on eval steps + if self.eval_steps > 0 and num_batches >= self.eval_steps: + logger.info(f"Reached eval_steps cap of {self.eval_steps}") + break - labels = batch.pop("labels") - loss = self.forward_only(batch, labels) + # Use the batch that was prefetched in previous iteration + batch = next_batch - total_loss += loss.item() - num_batches += 1 + # Extract epoch from current batch + current_epoch = self._extract_epoch_from_batch(batch) + if current_epoch is not None and starting_epoch is None: + starting_epoch = current_epoch + logger.info(f"Starting evaluation at epoch {starting_epoch}") - logger.info( - f" Eval batch {num_batches}/{self.eval_steps} | Loss: {loss.item():.4f}" - ) + # Prefetch next batch and start async all_reduce + try: + next_batch = next(val_dataloader) + + # Extract epoch from next batch + next_epoch = self._extract_epoch_from_batch(next_batch) + + # Start NON-BLOCKING all_reduce to check if any rank completed epoch + if next_epoch is not None and starting_epoch is not None: + # Check if next batch indicates epoch completion + epoch_increment = next_epoch - starting_epoch + + if torch.distributed.is_initialized(): + # Create tensor for all_reduce + epoch_tensor = torch.tensor( + [epoch_increment], dtype=torch.long, device=self.device + ) + # Start async all_reduce (returns immediately, doesn't block) + pending_work = torch.distributed.all_reduce( + epoch_tensor, + op=torch.distributed.ReduceOp.MAX, + async_op=True, # NON-BLOCKING - returns immediately + ) + else: + # Single rank case - just check locally + next_should_break = epoch_increment > 0 except StopIteration: - logger.warning("Reached end of validation dataloader early") - break + # No more batches - this is the last one + next_should_break = True + + # Process current batch (while all_reduce completes in background) + # Move tensors to device + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + batch[k] = v.to(self.device) + + labels = batch.pop("labels") + loss = self.forward_only(batch, labels) + # GPU compute happens here while network does all_reduce + + total_loss += loss.item() + num_batches += 1 + + eval_steps_info = f"/{self.eval_steps}" if self.eval_steps > 0 else "" + logger.info( + f" Eval batch {num_batches}{eval_steps_info} | 
Loss: {loss.item():.4f}" + ) # Set model back to train mode for model_part in self.model_parts: diff --git a/apps/sft/test_evaluate.py b/apps/sft/test_evaluate.py new file mode 100644 index 000000000..57959b09d --- /dev/null +++ b/apps/sft/test_evaluate.py @@ -0,0 +1,437 @@ +""" +Tests for the non-blocking all_reduce evaluation logic in main.py + +This tests the epoch-detection and async all_reduce pattern used to +synchronize evaluation completion across multiple ranks without blocking. +""" + +from dataclasses import dataclass +from unittest.mock import MagicMock, Mock, patch + +import pytest +import torch + + +@dataclass +class MockMetric: + """Mock metric object matching the structure in batch["metrics"]""" + + metric_name: str + value: int + + +class MockTrainer: + """Mock trainer with minimal setup for testing evaluate logic""" + + def __init__(self, eval_steps=0): + self.eval_steps = eval_steps + self.device = torch.device("cpu") + self.model_parts = [Mock()] + + def _extract_epoch_from_batch(self, batch: dict) -> int | None: + """Extract epoch number from batch metrics.""" + if "metrics" not in batch: + return None + + for metric in batch["metrics"]: + if hasattr(metric, "metric_name") and metric.metric_name == "num_epochs": + return metric.value + return None + + def forward_only(self, batch, labels): + """Mock forward pass - returns dummy loss""" + return torch.tensor(1.5) + + +def create_batch_with_epoch(epoch: int, loss_value: float = 1.5): + """Helper to create a mock batch with epoch metadata""" + return { + "input_ids": torch.randn(2, 10), + "attention_mask": torch.ones(2, 10), + "labels": torch.randint(0, 100, (2, 10)), + "metrics": [MockMetric(metric_name="num_epochs", value=epoch)], + } + + +def create_batch_without_epoch(loss_value: float = 1.5): + """Helper to create a mock batch without epoch metadata""" + return { + "input_ids": torch.randn(2, 10), + "attention_mask": torch.ones(2, 10), + "labels": torch.randint(0, 100, (2, 10)), + } + + +class TestExtractEpochFromBatch: + """Test the _extract_epoch_from_batch helper method""" + + def test_extract_epoch_success(self): + """Test extracting epoch from batch with proper metadata""" + trainer = MockTrainer() + batch = create_batch_with_epoch(epoch=5) + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch == 5 + + def test_extract_epoch_no_metrics(self): + """Test batch without metrics returns None""" + trainer = MockTrainer() + batch = create_batch_without_epoch() + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch is None + + def test_extract_epoch_wrong_metric_name(self): + """Test batch with metrics but wrong metric_name returns None""" + trainer = MockTrainer() + batch = { + "input_ids": torch.randn(2, 10), + "metrics": [MockMetric(metric_name="other_metric", value=10)], + } + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch is None + + def test_extract_epoch_multiple_metrics(self): + """Test extracting epoch from batch with multiple metrics""" + trainer = MockTrainer() + batch = { + "input_ids": torch.randn(2, 10), + "metrics": [ + MockMetric(metric_name="loss", value=1.5), + MockMetric(metric_name="num_epochs", value=3), + MockMetric(metric_name="step", value=100), + ], + } + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch == 3 + + +class TestEvaluationLogic: + """Test the evaluation loop logic (single-rank scenario)""" + + @pytest.mark.asyncio + async def test_single_epoch_completion(self): + """Test that evaluation stops after one complete epoch""" + 
trainer = MockTrainer(eval_steps=0) # No cap + + # Create batches: 3 from epoch 0, then epoch increments to 1 + batches = [ + create_batch_with_epoch(0), + create_batch_with_epoch(0), + create_batch_with_epoch(0), + create_batch_with_epoch(1), # Epoch increment - should trigger stop + ] + + dataloader = iter(batches) + + # Simulate the evaluation pattern + num_processed = 0 + starting_epoch = None + next_should_break = False + + # Get first batch + next_batch = next(dataloader) + + while True: + if next_should_break: + break + + batch = next_batch + + # Extract epoch from current batch + current_epoch = trainer._extract_epoch_from_batch(batch) + if current_epoch is not None and starting_epoch is None: + starting_epoch = current_epoch + + # Try to prefetch next batch + try: + next_batch = next(dataloader) + next_epoch = trainer._extract_epoch_from_batch(next_batch) + + # Check for epoch increment + if next_epoch is not None and starting_epoch is not None: + epoch_increment = next_epoch - starting_epoch + next_should_break = epoch_increment > 0 + + except StopIteration: + next_should_break = True + + # Process current batch + num_processed += 1 + + # Should have processed 3 batches (stopped when detected epoch 1) + assert num_processed == 3 + assert starting_epoch == 0 + + @pytest.mark.asyncio + async def test_eval_steps_cap(self): + """Test that evaluation respects eval_steps cap""" + trainer = MockTrainer(eval_steps=2) # Cap at 2 batches + + # Create 5 batches all in same epoch + batches = [create_batch_with_epoch(0) for _ in range(5)] + dataloader = iter(batches) + + # Simulate the evaluation pattern + num_processed = 0 + next_should_break = False + + # Get first batch + next_batch = next(dataloader) + + while True: + if next_should_break: + break + + # Check eval_steps cap + if trainer.eval_steps > 0 and num_processed >= trainer.eval_steps: + break + + batch = next_batch + + # Try to prefetch next batch + try: + next_batch = next(dataloader) + except StopIteration: + next_should_break = True + + # Process current batch + num_processed += 1 + + # Should have processed exactly 2 batches (eval_steps cap) + assert num_processed == 2 + + @pytest.mark.asyncio + async def test_empty_dataloader(self): + """Test handling of empty dataloader""" + trainer = MockTrainer(eval_steps=0) + + batches = [] + dataloader = iter(batches) + + # Should raise StopIteration immediately + with pytest.raises(StopIteration): + next_batch = next(dataloader) + + @pytest.mark.asyncio + async def test_single_batch(self): + """Test evaluation with only one batch""" + trainer = MockTrainer(eval_steps=0) + + batches = [create_batch_with_epoch(0)] + dataloader = iter(batches) + + num_processed = 0 + next_should_break = False + + # Get first batch + next_batch = next(dataloader) + + while True: + if next_should_break: + break + + batch = next_batch + + # Try to prefetch next batch + try: + next_batch = next(dataloader) + except StopIteration: + next_should_break = True + + # Process current batch + num_processed += 1 + + # Should have processed 1 batch + assert num_processed == 1 + + @pytest.mark.asyncio + async def test_no_epoch_metadata(self): + """Test evaluation when batches don't have epoch metadata""" + trainer = MockTrainer(eval_steps=3) # Use eval_steps as fallback + + # Create batches without epoch metadata + batches = [create_batch_without_epoch() for _ in range(5)] + dataloader = iter(batches) + + num_processed = 0 + next_should_break = False + next_batch = next(dataloader) + + while True: + if next_should_break: 
+ break + + # Check eval_steps cap (should be the stopping condition) + if trainer.eval_steps > 0 and num_processed >= trainer.eval_steps: + break + + batch = next_batch + + try: + next_batch = next(dataloader) + except StopIteration: + next_should_break = True + + num_processed += 1 + + # Should stop at eval_steps + assert num_processed == 3 + + +class TestAsyncAllReduce: + """Test the async all_reduce pattern with mocked distributed operations""" + + @pytest.mark.asyncio + async def test_async_all_reduce_pattern(self): + """Test the async all_reduce pattern with mock distributed operations""" + + # Mock distributed environment + with patch("torch.distributed.is_initialized", return_value=True): + with patch("torch.distributed.all_reduce") as mock_all_reduce: + + # Create mock Work handle for async operation + mock_work = Mock() + mock_work.wait = Mock() + mock_all_reduce.return_value = mock_work + + trainer = MockTrainer(eval_steps=0) + + # Simulate the async pattern + epoch_tensor = torch.tensor([0], dtype=torch.long) + + # Start async all_reduce (should return immediately) + work_handle = torch.distributed.all_reduce( + epoch_tensor, op=torch.distributed.ReduceOp.MAX, async_op=True + ) + + # Verify it returned immediately with a work handle + assert work_handle is not None + assert mock_all_reduce.called + + # Simulate doing computation here... + + # Wait for completion + work_handle.wait() + assert mock_work.wait.called + + @pytest.mark.asyncio + async def test_multi_rank_epoch_detection(self): + """Test that epoch completion is detected when ANY rank finishes""" + + with patch("torch.distributed.is_initialized", return_value=True): + with patch("torch.distributed.all_reduce") as mock_all_reduce: + + def all_reduce_side_effect(tensor, op, async_op=False): + """Simulate all_reduce MAX operation across ranks + Rank 0: epoch_increment = 0 (still in epoch 0) + Rank 1: epoch_increment = 1 (moved to epoch 1) + MAX = 1, so all ranks should stop + """ + # Simulate MAX operation - set tensor to max value + tensor[0] = 1 # At least one rank has epoch_increment=1 + + if async_op: + mock_work = Mock() + mock_work.wait = Mock() + return mock_work + return None + + mock_all_reduce.side_effect = all_reduce_side_effect + + trainer = MockTrainer(eval_steps=0) + + # Simulate rank 1's perspective: it moved to epoch 1 + starting_epoch = 0 + next_epoch = 1 + epoch_increment = next_epoch - starting_epoch # = 1 + + epoch_tensor = torch.tensor([epoch_increment], dtype=torch.long) + + # Start async all_reduce + work = torch.distributed.all_reduce( + epoch_tensor, op=torch.distributed.ReduceOp.MAX, async_op=True + ) + + # Wait for result + work.wait() + + # Check if should break (any rank has increment > 0) + should_break = epoch_tensor.item() > 0 + + assert should_break is True + assert epoch_tensor.item() == 1 + + +class TestEvaluationIntegration: + """Integration-style tests for the full evaluation flow""" + + @pytest.mark.asyncio + async def test_prefetch_pattern_ordering(self): + """Test that the prefetch pattern processes batches in correct order""" + trainer = MockTrainer(eval_steps=0) + + # Create identifiable batches + batches = [ + { + "id": 0, + "metrics": [MockMetric("num_epochs", 0)], + "labels": torch.zeros(1), + }, + { + "id": 1, + "metrics": [MockMetric("num_epochs", 0)], + "labels": torch.zeros(1), + }, + { + "id": 2, + "metrics": [MockMetric("num_epochs", 0)], + "labels": torch.zeros(1), + }, + { + "id": 3, + "metrics": [MockMetric("num_epochs", 1)], + "labels": torch.zeros(1), + }, + ] + + 
dataloader = iter(batches) + processed_ids = [] + + # Prefetch first batch + next_batch = next(dataloader) + next_should_break = False + starting_epoch = None + + while True: + if next_should_break: + break + + # Process current batch + batch = next_batch + processed_ids.append(batch["id"]) + + # Extract epoch + current_epoch = trainer._extract_epoch_from_batch(batch) + if current_epoch is not None and starting_epoch is None: + starting_epoch = current_epoch + + # Prefetch next + try: + next_batch = next(dataloader) + next_epoch = trainer._extract_epoch_from_batch(next_batch) + + if next_epoch is not None and starting_epoch is not None: + epoch_increment = next_epoch - starting_epoch + next_should_break = epoch_increment > 0 + except StopIteration: + next_should_break = True + + # Should have processed batches 0, 1, 2 (stopped when detected batch 3 has epoch 1) + assert processed_ids == [0, 1, 2] + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 4793948f15fad070cde9202e75b955f5f64cdb4a Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Fri, 17 Oct 2025 14:08:49 -0700 Subject: [PATCH 5/7] Add configurable datasets and validation and shortening the code --- apps/sft/llama3_8b.yaml | 18 ++++- apps/sft/main.py | 147 ++++++++++++++++++---------------------- apps/sft/qwen3_8b.yaml | 15 +++- 3 files changed, 94 insertions(+), 86 deletions(-) diff --git a/apps/sft/llama3_8b.yaml b/apps/sft/llama3_8b.yaml index 2fd563a6c..f24936670 100644 --- a/apps/sft/llama3_8b.yaml +++ b/apps/sft/llama3_8b.yaml @@ -26,15 +26,27 @@ optimizer: lr_scheduler: warmup_steps: 200 +dataset: + path: "yahma/alpaca-cleaned" + split: "train[:95%]" + +dataset_val: + path: "yahma/alpaca-cleaned" + split: "train[95%:]" + training: local_batch_size: 1 seq_len: 2048 max_norm: 1.0 steps: 1000 compile: false - dataset: "c4" - #eval_interval: 500 # Setting eval_interval to run evaluation - #eval_steps: 100 # Number of validation batches during each evaluation run + + +validation: + enabled: true # Enable/disable validation + eval_interval: 100 # Run evaluation every 100 training steps + eval_steps: 50 # Number of batches per evaluation (0 = full epoch) + parallelism: data_parallel_replicate_degree: 1 diff --git a/apps/sft/main.py b/apps/sft/main.py index 7d6cfc665..c694867fb 100644 --- a/apps/sft/main.py +++ b/apps/sft/main.py @@ -79,9 +79,25 @@ def __init__(self, config: DictConfig): self._rank = current_rank().rank self._size = math.prod(current_size().values()) - # Evaluation settings - self.eval_interval = job_config.training.get("eval_interval", float("inf")) - self.eval_steps = job_config.training.get("eval_steps", 0) + # Evaluation settings from validation config + validation_config = job_config.get("validation", {}) + self.validation_enabled = validation_config.get("enabled", False) + + if self.validation_enabled: + self.eval_interval = validation_config.get("eval_interval") + self.eval_steps = validation_config.get("eval_steps") + + if self.eval_interval is None: + raise ValueError( + "validation.eval_interval is required when validation.enabled is true" + ) + if self.eval_steps is None: + raise ValueError( + "validation.eval_steps is required when validation.enabled is true" + ) + else: + self.eval_interval = None + self.eval_steps = None self._init_dist() super().__init__(job_config) @@ -113,23 +129,30 @@ def _init_dist(self): @endpoint async def setup(self): - # Setup training data (first 90% of train split) + # Setup training data from config + dataset_config = 
self.job_config.get("dataset") + self.train_dataloader = self.setup_data( - dataset_path="yahma/alpaca-cleaned", dataset_split="train[:90%]" + dataset_path=dataset_config.get("path"), + dataset_split=dataset_config.get("split"), ) - # Setup validation data (last 10% of train split) + # Setup validation data from config + dataset_val_config = self.job_config.get("dataset_val", {}) self.val_dataloader = self.setup_data( - dataset_path="yahma/alpaca-cleaned", dataset_split="train[90%:]" + dataset_path=dataset_val_config.get("path", dataset_config.get("path")), + dataset_split=dataset_val_config.get("split", dataset_config.get("split")), ) # Load checkpoint if resuming self.checkpointer.load(step=self.current_step) - def setup_data( - self, dataset_path: str = "yahma/alpaca-cleaned", dataset_split: str = "train" - ): + def setup_data(self, dataset_path: str, dataset_split: str): """Setup data with configurable dataset path and split.""" + if not dataset_path or not dataset_split: + raise ValueError( + f"dataset.path and dataset.split are required in YAML config. Got path={dataset_path}, split={dataset_split}" + ) print(os.path.join(self.job_config.model.hf_assets_path, "tokenizer.json")) tokenizer = HuggingFaceModelTokenizer( tokenizer_json_path=os.path.join( @@ -281,39 +304,26 @@ def train_step(self, batch) -> None: def _extract_epoch_from_batch(self, batch: dict) -> int | None: """Extract epoch number from batch metrics.""" - if "metrics" not in batch: - return None - - for metric in batch["metrics"]: - if hasattr(metric, "metric_name") and metric.metric_name == "num_epochs": - return metric.value + if "metrics" in batch: + for metric in batch["metrics"]: + if ( + hasattr(metric, "metric_name") + and metric.metric_name == "num_epochs" + ): + return metric.value return None async def evaluate(self) -> dict[str, float]: - """Run evaluation on validation set for one complete epoch. - - Uses prefetch + non-blocking all_reduce pattern to detect epoch completion - across all ranks without blocking on every batch. - - Pattern: - - Iteration N: Start async all_reduce on next batch's epoch (non-blocking) - - Process current batch while all_reduce completes in background - - Iteration N+1: Check result from previous all_reduce (should be done) - - This overlaps communication with computation for better performance. 
- """ + """Run evaluation with async all_reduce for cross-rank epoch synchronization.""" logger.info("=" * 50) - logger.info("STARTING EVALUATION ") + logger.info("STARTING EVALUATION") logger.info("=" * 50) - # Set model to eval mode for model_part in self.model_parts: model_part.eval() val_dataloader = iter(self.val_dataloader) - total_loss = 0.0 - num_batches = 0 - starting_epoch = None + total_loss, num_batches, starting_epoch = 0.0, 0, None # Prefetch first batch try: @@ -322,106 +332,79 @@ async def evaluate(self) -> dict[str, float]: logger.warning("Validation dataloader is empty") return {"val_loss": 0.0, "val_batches": 0} - next_should_break = False - pending_work = None # Handle for async all_reduce - epoch_tensor = None # Tensor for all_reduce result + should_break, pending_work, epoch_tensor = False, None, None with torch.no_grad(): while True: - # Check result from PREVIOUS iteration's async all_reduce + # Wait for previous async all_reduce to complete if pending_work is not None: - pending_work.wait() # Should be complete (or very fast) since we did compute - if epoch_tensor is not None: - next_should_break = epoch_tensor.item() > 0 + pending_work.wait() + should_break = ( + epoch_tensor.item() > 0 if epoch_tensor is not None else False + ) pending_work = None - # Check if we should break (based on previous iteration's check) - if next_should_break: + if should_break: logger.info( "Epoch completed across all ranks - stopping evaluation" ) break - # Check optional cap on eval steps if self.eval_steps > 0 and num_batches >= self.eval_steps: logger.info(f"Reached eval_steps cap of {self.eval_steps}") break - # Use the batch that was prefetched in previous iteration batch = next_batch - # Extract epoch from current batch + # Track starting epoch current_epoch = self._extract_epoch_from_batch(batch) if current_epoch is not None and starting_epoch is None: starting_epoch = current_epoch - logger.info(f"Starting evaluation at epoch {starting_epoch}") - # Prefetch next batch and start async all_reduce + # Prefetch next batch and start async epoch check try: next_batch = next(val_dataloader) - - # Extract epoch from next batch next_epoch = self._extract_epoch_from_batch(next_batch) - # Start NON-BLOCKING all_reduce to check if any rank completed epoch if next_epoch is not None and starting_epoch is not None: - # Check if next batch indicates epoch completion epoch_increment = next_epoch - starting_epoch - if torch.distributed.is_initialized(): - # Create tensor for all_reduce epoch_tensor = torch.tensor( [epoch_increment], dtype=torch.long, device=self.device ) - # Start async all_reduce (returns immediately, doesn't block) pending_work = torch.distributed.all_reduce( epoch_tensor, op=torch.distributed.ReduceOp.MAX, - async_op=True, # NON-BLOCKING - returns immediately + async_op=True, ) else: - # Single rank case - just check locally - next_should_break = epoch_increment > 0 - + should_break = epoch_increment > 0 except StopIteration: - # No more batches - this is the last one - next_should_break = True + should_break = True - # Process current batch (while all_reduce completes in background) - # Move tensors to device + # Process current batch (overlaps with async all_reduce) for k, v in batch.items(): if isinstance(v, torch.Tensor): batch[k] = v.to(self.device) labels = batch.pop("labels") loss = self.forward_only(batch, labels) - # GPU compute happens here while network does all_reduce - total_loss += loss.item() num_batches += 1 - eval_steps_info = f"/{self.eval_steps}" if 
self.eval_steps > 0 else "" - logger.info( - f" Eval batch {num_batches}{eval_steps_info} | Loss: {loss.item():.4f}" - ) + if num_batches % 10 == 0: + logger.info(f" Eval batch {num_batches} | Loss: {loss.item():.4f}") - # Set model back to train mode for model_part in self.model_parts: model_part.train() avg_loss = total_loss / max(num_batches, 1) - - metrics = { - "val_loss": avg_loss, - "val_batches": num_batches, - } - - logger.info("-" * 50) - logger.info(f"EVALUATION COMPLETE") - logger.info(f"Validation Loss: {avg_loss:.4f}") - logger.info(f"Batches Evaluated: {num_batches}") + logger.info( + f"EVALUATION COMPLETE | Val Loss: {avg_loss:.4f} | Batches: {num_batches}" + ) logger.info("=" * 50) - return metrics + + return {"val_loss": avg_loss, "val_batches": num_batches} @endpoint async def train(self) -> None: @@ -439,8 +422,8 @@ async def train(self) -> None: self.train_step(batch) self.current_step += 1 - # Run evaluation periodically - if self.current_step % self.eval_interval == 0: + # Run evaluation periodically if enabled + if self.validation_enabled and self.current_step % self.eval_interval == 0: eval_metrics = await self.evaluate() logger.info(f"Step {self.current_step} | Eval metrics: {eval_metrics}") diff --git a/apps/sft/qwen3_8b.yaml b/apps/sft/qwen3_8b.yaml index 2ab88bbd3..2d4128065 100644 --- a/apps/sft/qwen3_8b.yaml +++ b/apps/sft/qwen3_8b.yaml @@ -25,13 +25,26 @@ optimizer: lr_scheduler: warmup_steps: 200 +# Dataset configuration +dataset: + path: "yahma/alpaca-cleaned" + split: "train[:95%]" + +dataset_val: + path: "yahma/alpaca-cleaned" + split: "train[95%:]" + training: local_batch_size: 1 seq_len: 2048 max_norm: 1.0 steps: 1000 compile: false - dataset: "c4" + +validation: + enabled: true # Enable/disable validation + eval_interval: 100 # Run evaluation every 100 training steps + eval_steps: 50 # Number of batches per evaluation (0 = full epoch) parallelism: data_parallel_replicate_degree: 1 From 676db88bc2e51d5b1d9355bf44c0e74ccba79dd5 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Fri, 17 Oct 2025 14:09:05 -0700 Subject: [PATCH 6/7] Add configurable datasets and validation and shortening the code --- apps/sft/llama3_8b_test_eval.yaml | 65 +++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 apps/sft/llama3_8b_test_eval.yaml diff --git a/apps/sft/llama3_8b_test_eval.yaml b/apps/sft/llama3_8b_test_eval.yaml new file mode 100644 index 000000000..65abf164f --- /dev/null +++ b/apps/sft/llama3_8b_test_eval.yaml @@ -0,0 +1,65 @@ +# Test configuration to verify evaluation is working +# Runs very few steps with frequent evaluation + +comm: + trace_buf_size: 0 + +model: + name: llama3 + flavor: 8B + hf_assets_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct + +processes: + procs: 8 # Just 2 processes for faster testing + with_gpus: true + +optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + + +lr_scheduler: + warmup_steps: 2 + +dataset: + path: "yahma/alpaca-cleaned" + split: "train[:95%]" + +dataset_val: + path: "yahma/alpaca-cleaned" + split: "train[95%:]" + +training: + local_batch_size: 4 + seq_len: 512 # Shorter sequences for speed + max_norm: 1.0 + steps: 100 # Only 10 training steps total + compile: false + +validation: + enabled: true # Enable/disable validation + eval_interval: 100 # Run evaluation every 100 training steps + eval_steps: 50 # Number of batches per evaluation (0 = full epoch) + +parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + tensor_parallel_degree: 2 + 
pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: false + +checkpoint: + enable: true + folder: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/test_eval_checkpoints + initial_load_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/ + initial_load_in_hf: true + last_save_in_hf: true + interval: 100 # Don't save frequently during test + async_mode: disabled + +activation_checkpoint: + mode: selective + selective_ac_option: op From 250c0cd28276edc27b3d6ee228dda1f80f4d1ca8 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Fri, 17 Oct 2025 14:23:00 -0700 Subject: [PATCH 7/7] Removed llama test eval --- apps/sft/llama3_8b_test_eval.yaml | 65 ------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 apps/sft/llama3_8b_test_eval.yaml diff --git a/apps/sft/llama3_8b_test_eval.yaml b/apps/sft/llama3_8b_test_eval.yaml deleted file mode 100644 index 65abf164f..000000000 --- a/apps/sft/llama3_8b_test_eval.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Test configuration to verify evaluation is working -# Runs very few steps with frequent evaluation - -comm: - trace_buf_size: 0 - -model: - name: llama3 - flavor: 8B - hf_assets_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct - -processes: - procs: 8 # Just 2 processes for faster testing - with_gpus: true - -optimizer: - name: AdamW - lr: 1e-5 - eps: 1e-8 - - -lr_scheduler: - warmup_steps: 2 - -dataset: - path: "yahma/alpaca-cleaned" - split: "train[:95%]" - -dataset_val: - path: "yahma/alpaca-cleaned" - split: "train[95%:]" - -training: - local_batch_size: 4 - seq_len: 512 # Shorter sequences for speed - max_norm: 1.0 - steps: 100 # Only 10 training steps total - compile: false - -validation: - enabled: true # Enable/disable validation - eval_interval: 100 # Run evaluation every 100 training steps - eval_steps: 50 # Number of batches per evaluation (0 = full epoch) - -parallelism: - data_parallel_replicate_degree: 1 - data_parallel_shard_degree: -1 - tensor_parallel_degree: 2 - pipeline_parallel_degree: 1 - context_parallel_degree: 1 - expert_parallel_degree: 1 - disable_loss_parallel: false - -checkpoint: - enable: true - folder: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/test_eval_checkpoints - initial_load_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/ - initial_load_in_hf: true - last_save_in_hf: true - interval: 100 # Don't save frequently during test - async_mode: disabled - -activation_checkpoint: - mode: selective - selective_ac_option: op
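
The core of the evaluation change in PATCH 4/7 and 5/7 is the prefetch + non-blocking all_reduce loop inside `evaluate()`: each iteration processes the current batch while an asynchronous `MAX` reduction on the next batch's epoch counter runs in the background, and every rank stops once any rank has rolled over to the next epoch. Below is a minimal, self-contained sketch of that pattern. It is not the trainer's actual code: the helper names (`run_one_eval_epoch`, `process_batch`, `get_epoch`) are illustrative, and for brevity it assumes the epoch counter sits directly on the batch dict, whereas the real implementation reads it from `batch["metrics"]` via `_extract_epoch_from_batch()`.

```python
# Sketch of the prefetch + non-blocking all_reduce epoch-sync pattern.
# Assumptions (not the real trainer API): batches are dicts with an optional
# "num_epochs" key, and process_batch() performs the forward pass.
import torch
import torch.distributed as dist


def run_one_eval_epoch(batches, process_batch, device, eval_steps=0):
    """Consume batches until any rank observes the epoch counter increase."""

    def get_epoch(batch):
        # Illustrative: the actual code extracts this from batch["metrics"].
        return batch.get("num_epochs")

    try:
        next_batch = next(batches)
    except StopIteration:
        return 0  # empty validation set

    start_epoch, num_batches = None, 0
    should_break, pending_work, epoch_tensor = False, None, None

    while True:
        # Collect the result of the all_reduce launched last iteration.
        if pending_work is not None:
            pending_work.wait()
            should_break = epoch_tensor.item() > 0
            pending_work = None
        if should_break or (eval_steps > 0 and num_batches >= eval_steps):
            break

        batch = next_batch
        if start_epoch is None:
            start_epoch = get_epoch(batch)

        # Prefetch the next batch and start a non-blocking MAX reduction on
        # "did any rank roll over to the next epoch?".
        try:
            next_batch = next(batches)
            increment = (get_epoch(next_batch) or 0) - (start_epoch or 0)
            if dist.is_initialized():
                epoch_tensor = torch.tensor(
                    [increment], dtype=torch.long, device=device
                )
                pending_work = dist.all_reduce(
                    epoch_tensor, op=dist.ReduceOp.MAX, async_op=True
                )
            else:
                should_break = increment > 0  # single-rank fallback
        except StopIteration:
            should_break = True  # no more batches: process this one, then stop

        # Compute on the current batch while the reduction runs in the background.
        process_batch(batch)
        num_batches += 1

    return num_batches
```

As in the patched `evaluate()`, the `wait()` at the top of each iteration is expected to return almost immediately because the forward pass of the previous batch overlapped with the communication; `eval_steps` plays the same role as `validation.eval_steps` (0 meaning a full epoch).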