From 365345327fe6bbb04e56d85de18b84dadfcbd9fe Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Wed, 8 Oct 2025 10:56:34 -0700 Subject: [PATCH 1/7] Submitting an interactive notebook to run SFT --- apps/sft_v2/README_NOTEBOOK.md | 435 ++++++++++++++++++ apps/sft_v2/notebook_utils.py | 463 +++++++++++++++++++ apps/sft_v2/sft_training_notebook.ipynb | 568 ++++++++++++++++++++++++ 3 files changed, 1466 insertions(+) create mode 100644 apps/sft_v2/README_NOTEBOOK.md create mode 100644 apps/sft_v2/notebook_utils.py create mode 100644 apps/sft_v2/sft_training_notebook.ipynb diff --git a/apps/sft_v2/README_NOTEBOOK.md b/apps/sft_v2/README_NOTEBOOK.md new file mode 100644 index 000000000..eb70a29ea --- /dev/null +++ b/apps/sft_v2/README_NOTEBOOK.md @@ -0,0 +1,435 @@ +# ๐Ÿš€ SFT Training Notebook Guide + +This directory contains an interactive Jupyter notebook experience for training Language Models with Supervised Fine-Tuning (SFT). + +## ๐Ÿ“ Files + +### Core Files +- **`sft_training_notebook.ipynb`** - Main Jupyter notebook for interactive training +- **`notebook_utils.py`** - Utility functions for notebook-based training +- **`main.py`** - Original command-line training script (unchanged) + +### Configuration Files +- **`llama3_8b.yaml`** - Original single-node config +- **`llama3_8b_single_node.yaml`** - Single-node config without provisioner +- **`llama3_8b_slurm_multinode.yaml`** - Multi-node config with SLURM +- **`llama3_8b_local.yaml`** - Local testing config + +## ๐ŸŽฏ Quick Start + +### 1. Open the Notebook + +```bash +cd /home/hosseinkh/forge +jupyter notebook apps/sft_v2/sft_training_notebook.ipynb +``` + +Or in VS Code: +- Open `apps/sft_v2/sft_training_notebook.ipynb` +- Select Python kernel +- Run cells sequentially + +### 2. Configure Training + +The notebook is organized into sections: + +1. **๐Ÿ“ฆ Model Configuration** - Choose model and path +2. **โš™๏ธ Training Configuration** - Set hyperparameters +3. **๐Ÿ”ง Optimizer Configuration** - Configure optimizer and LR scheduler +4. **๐Ÿ”€ Parallelism Configuration** - Set distributed training strategy +5. **๐Ÿ’พ Checkpoint Configuration** - Configure checkpointing +6. **๐Ÿ–ฅ๏ธ Resource Configuration** - Set number of GPUs/nodes +7. **โ˜๏ธ Provisioner Configuration** (optional) - For multi-node SLURM + +### 3. Run Training + +Execute the "Run Training!" cell to start training with your configuration. + +## ๐Ÿ“š Using the Utility Library + +The `notebook_utils.py` module provides a clean API for training: + +### Configuration Builders + +```python +from apps.sft_v2 import notebook_utils as nb + +# Create model config +model_config = nb.create_model_config( + name="llama3", + flavor="8B", + hf_assets_path="/path/to/model" +) + +# Create training config +training_config = nb.create_training_config( + steps=1000, + local_batch_size=1, + seq_len=2048 +) + +# Create optimizer config +optimizer_config = nb.create_optimizer_config( + name="AdamW", + lr=1e-5 +) + +# ... configure other components + +# Build complete config +config = nb.build_config( + model_config=model_config, + training_config=training_config, + optimizer_config=optimizer_config, + # ... 
other configs +) +``` + +### Training Functions + +```python +# Simple: run everything +nb.train(config) + +# Advanced: step-by-step control +import asyncio + +async def custom_training(): + # Initialize + await nb.initialize_provisioner(config) + + # Create and setup + recipe = await nb.create_recipe(config) + await nb.setup_recipe(recipe) + + # Train + await nb.train_recipe(recipe) + + # Cleanup + await nb.cleanup_recipe(recipe) + await nb.shutdown_provisioner(config) + +asyncio.run(custom_training()) +``` + +### Display Utilities + +```python +# Print summary +nb.summarize_config(config) + +# Print full YAML +nb.print_config(config, title="My Config") +``` + +## ๐Ÿ”ง Configuration Functions Reference + +### Model Configuration + +```python +nb.create_model_config( + name: str = "llama3", + flavor: str = "8B", + hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct" +) +``` + +### Training Configuration + +```python +nb.create_training_config( + local_batch_size: int = 1, + seq_len: int = 2048, + max_norm: float = 1.0, + steps: int = 1000, + dataset: str = "c4", + compile: bool = False +) +``` + +### Optimizer Configuration + +```python +nb.create_optimizer_config( + name: str = "AdamW", + lr: float = 1e-5, + eps: float = 1e-8, + weight_decay: float = 0.0, + betas: tuple = (0.9, 0.999) +) +``` + +### LR Scheduler Configuration + +```python +nb.create_lr_scheduler_config( + warmup_steps: int = 200, + decay_steps: Optional[int] = None, + min_lr: float = 0.0 +) +``` + +### Parallelism Configuration + +```python +nb.create_parallelism_config( + data_parallel_replicate_degree: int = 1, + data_parallel_shard_degree: int = -1, # -1 = auto (FSDP) + tensor_parallel_degree: int = 1, + pipeline_parallel_degree: int = 1, + context_parallel_degree: int = 1, + expert_parallel_degree: int = 1, + disable_loss_parallel: bool = False +) +``` + +### Checkpoint Configuration + +```python +nb.create_checkpoint_config( + enable: bool = True, + folder: str = "/tmp/checkpoints", + initial_load_path: Optional[str] = None, + initial_load_in_hf: bool = True, + last_save_in_hf: bool = True, + interval: int = 500, + async_mode: str = "disabled" +) +``` + +### Activation Checkpoint Configuration + +```python +nb.create_activation_checkpoint_config( + mode: str = "selective", # 'selective', 'full', 'none' + selective_ac_option: str = "op" +) +``` + +### Process Configuration + +```python +# Single node +nb.create_process_config( + procs: int = 8, + with_gpus: bool = True, + hosts: Optional[int] = None +) + +# Multi-node +nb.create_process_config( + procs: int = 8, + with_gpus: bool = True, + hosts: int = 4 # 4 nodes +) +``` + +### Provisioner Configuration (Multi-Node Only) + +```python +nb.create_provisioner_config( + launcher: str = "slurm", + job_name: str = "sft_training", + partition: Optional[str] = None, + time: Optional[str] = None, + account: Optional[str] = None +) +``` + +## ๐Ÿ“– Example Configurations + +### Quick Test (Single GPU, 10 steps) + +```python +model_config = nb.create_model_config( + name="llama3", + flavor="8B", + hf_assets_path="/path/to/model" +) + +training_config = nb.create_training_config( + steps=10, + local_batch_size=1 +) + +process_config = nb.create_process_config(procs=1) + +# ... 
configure other components with defaults +``` + +### Single Node, 8 GPUs, FSDP + +```python +parallelism_config = nb.create_parallelism_config( + data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP +) + +process_config = nb.create_process_config(procs=8) + +# No provisioner needed +provisioner_config = None +``` + +### Multi-Node, 4ร—8 GPUs, Tensor Parallel + +```python +parallelism_config = nb.create_parallelism_config( + data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP + tensor_parallel_degree=2 +) + +process_config = nb.create_process_config( + procs=8, + hosts=4 +) + +provisioner_config = nb.create_provisioner_config( + launcher="slurm", + job_name="sft_multinode", + partition="gpu_partition", + time="24:00:00" +) +``` + +## ๐ŸŽ“ Advanced Usage + +### Custom Training Loop + +You can modify the training loop by creating your own recipe class: + +```python +from apps.sft_v2.main import ForgeSFTRecipe + +class CustomRecipe(ForgeSFTRecipe): + async def train(self): + # Custom training logic + dataloader = iter(self.train_dataloader) + + for step in range(self.num_training_steps): + batch = next(dataloader) + # Custom batch processing + self.train_step(batch) +``` + +### Experiment Tracking + +Integrate with your favorite tracking tool: + +```python +import wandb + +# Initialize tracking +wandb.init(project="sft-training", config=config) + +# Train +nb.train(config) + +# Log results +wandb.log({"final_step": config.training.steps}) +``` + +### Config Variations + +Generate multiple configs for hyperparameter sweeps: + +```python +learning_rates = [1e-5, 5e-5, 1e-4] +configs = [] + +for lr in learning_rates: + optimizer_config = nb.create_optimizer_config(lr=lr) + config = nb.build_config( + # ... other configs + optimizer_config=optimizer_config + ) + configs.append(config) + +# Train all configs +for config in configs: + nb.train(config) +``` + +## ๐Ÿ” Debugging Tips + +### Start Simple + +1. **Use 1 GPU first**: + ```python + process_config = nb.create_process_config(procs=1) + ``` + +2. **Run few steps**: + ```python + training_config = nb.create_training_config(steps=10) + ``` + +3. **Disable compilation**: + ```python + training_config = nb.create_training_config(compile=False) + ``` + +### Common Issues + +**Memory Errors:** +- Reduce batch size or sequence length +- Enable FSDP: `data_parallel_shard_degree=-1` +- Enable activation checkpointing: `mode="selective"` or `"full"` + +**Slow Training:** +- Increase batch size if memory allows +- Enable compilation: `compile=True` +- Use tensor parallelism for large models + +**Actor Timeout Errors:** +- Make sure you're not using provisioner config on single node +- Check SLURM availability with `sinfo` +- See `TROUBLESHOOTING_MULTINODE.md` for details + +## ๐Ÿ“ฆ Saving and Loading Configs + +### Save Config + +```python +from omegaconf import OmegaConf + +config_path = "my_config.yaml" +with open(config_path, 'w') as f: + OmegaConf.save(config, f) +``` + +### Load Config + +```python +from omegaconf import OmegaConf + +config = OmegaConf.load("my_config.yaml") +nb.train(config) +``` + +## ๐Ÿš€ Next Steps + +1. **Start with the notebook**: Open `sft_training_notebook.ipynb` and follow along +2. **Try a test run**: Configure for 10 steps with 1 GPU +3. **Scale up**: Increase to 8 GPUs with FSDP +4. 
**Go multi-node**: Configure SLURM provisioner for cluster training + +## ๐Ÿ“š Additional Resources + +- **`MULTINODE_SFT_V2_GUIDE.md`** - Detailed guide on multi-node training +- **`TROUBLESHOOTING_MULTINODE.md`** - Troubleshooting guide for multi-node issues +- **`main.py`** - Original implementation for reference + +## ๐Ÿค Contributing + +To add new configuration options: + +1. Add a `create_*_config()` function in `notebook_utils.py` +2. Update `build_config()` to include the new config +3. Add a new cell in the notebook to configure it +4. Update this README + +## โš–๏ธ License + +Copyright (c) Meta Platforms, Inc. and affiliates. + +Licensed under the BSD-style license found in the LICENSE file. diff --git a/apps/sft_v2/notebook_utils.py b/apps/sft_v2/notebook_utils.py new file mode 100644 index 000000000..b3636fd26 --- /dev/null +++ b/apps/sft_v2/notebook_utils.py @@ -0,0 +1,463 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utility functions for notebook-based SFT training. +This module provides a clean API for interactive training in Jupyter notebooks. +""" + +import asyncio +import logging +from typing import Any, Dict, Optional + +import torch + +from apps.sft_v2.main import ForgeSFTRecipe +from omegaconf import DictConfig, OmegaConf + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +# ============================================================================ +# Configuration Builders +# ============================================================================ + + +def create_model_config( + name: str = "llama3", + flavor: str = "8B", + hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct", +) -> Dict[str, Any]: + """ + Create model configuration. + + Args: + name: Model architecture name (e.g., 'llama3', 'llama2') + flavor: Model size (e.g., '8B', '70B') + hf_assets_path: Path to HuggingFace model assets + + Returns: + Dictionary with model configuration + """ + return { + "name": name, + "flavor": flavor, + "hf_assets_path": hf_assets_path, + } + + +def create_optimizer_config( + name: str = "AdamW", + lr: float = 1e-5, + eps: float = 1e-8, + weight_decay: float = 0.0, + betas: tuple = (0.9, 0.999), +) -> Dict[str, Any]: + """ + Create optimizer configuration. + + Args: + name: Optimizer name (e.g., 'AdamW', 'Adam', 'SGD') + lr: Learning rate + eps: Epsilon for numerical stability + weight_decay: L2 regularization coefficient + betas: Coefficients for computing running averages + + Returns: + Dictionary with optimizer configuration + """ + return { + "name": name, + "lr": lr, + "eps": eps, + "weight_decay": weight_decay, + "betas": list(betas), + } + + +def create_lr_scheduler_config( + warmup_steps: int = 200, + decay_steps: Optional[int] = None, + min_lr: float = 0.0, +) -> Dict[str, Any]: + """ + Create learning rate scheduler configuration. 
+ + Args: + warmup_steps: Number of warmup steps + decay_steps: Number of decay steps (None = no decay) + min_lr: Minimum learning rate + + Returns: + Dictionary with LR scheduler configuration + """ + config = {"warmup_steps": warmup_steps} + if decay_steps is not None: + config["decay_steps"] = decay_steps + if min_lr > 0: + config["min_lr"] = min_lr + return config + + +def create_training_config( + local_batch_size: int = 1, + seq_len: int = 2048, + max_norm: float = 1.0, + steps: int = 1000, + dataset: str = "c4", + compile: bool = False, +) -> Dict[str, Any]: + """ + Create training configuration. + + Args: + local_batch_size: Batch size per GPU + seq_len: Sequence length + max_norm: Gradient clipping max norm + steps: Total training steps + dataset: Dataset name + compile: Whether to use torch.compile + + Returns: + Dictionary with training configuration + """ + return { + "local_batch_size": local_batch_size, + "seq_len": seq_len, + "max_norm": max_norm, + "steps": steps, + "dataset": dataset, + "compile": compile, + } + + +def create_parallelism_config( + data_parallel_replicate_degree: int = 1, + data_parallel_shard_degree: int = -1, + tensor_parallel_degree: int = 1, + pipeline_parallel_degree: int = 1, + context_parallel_degree: int = 1, + expert_parallel_degree: int = 1, + disable_loss_parallel: bool = False, +) -> Dict[str, Any]: + """ + Create parallelism configuration. + + Args: + data_parallel_replicate_degree: Data parallel replication + data_parallel_shard_degree: Data parallel sharding (FSDP), -1 = auto + tensor_parallel_degree: Tensor parallelism degree + pipeline_parallel_degree: Pipeline parallelism degree + context_parallel_degree: Context parallelism degree + expert_parallel_degree: Expert parallelism degree (for MoE) + disable_loss_parallel: Whether to disable loss parallelism + + Returns: + Dictionary with parallelism configuration + """ + return { + "data_parallel_replicate_degree": data_parallel_replicate_degree, + "data_parallel_shard_degree": data_parallel_shard_degree, + "tensor_parallel_degree": tensor_parallel_degree, + "pipeline_parallel_degree": pipeline_parallel_degree, + "context_parallel_degree": context_parallel_degree, + "expert_parallel_degree": expert_parallel_degree, + "disable_loss_parallel": disable_loss_parallel, + } + + +def create_checkpoint_config( + enable: bool = True, + folder: str = "/tmp/checkpoints", + initial_load_path: Optional[str] = None, + initial_load_in_hf: bool = True, + last_save_in_hf: bool = True, + interval: int = 500, + async_mode: str = "disabled", +) -> Dict[str, Any]: + """ + Create checkpoint configuration. + + Args: + enable: Whether to enable checkpointing + folder: Path to save checkpoints + initial_load_path: Path to load initial checkpoint from + initial_load_in_hf: Load initial checkpoint in HF format + last_save_in_hf: Save last checkpoint in HF format + interval: Steps between checkpoints + async_mode: Async checkpoint mode ('disabled', 'async', etc.) + + Returns: + Dictionary with checkpoint configuration + """ + return { + "enable": enable, + "folder": folder, + "initial_load_path": initial_load_path, + "initial_load_in_hf": initial_load_in_hf, + "last_save_in_hf": last_save_in_hf, + "interval": interval, + "async_mode": async_mode, + } + + +def create_activation_checkpoint_config( + mode: str = "selective", + selective_ac_option: str = "op", +) -> Dict[str, Any]: + """ + Create activation checkpointing configuration. 
+ + Args: + mode: Activation checkpoint mode ('selective', 'full', 'none') + selective_ac_option: Selective AC option ('op', 'layer', etc.) + + Returns: + Dictionary with activation checkpoint configuration + """ + return { + "mode": mode, + "selective_ac_option": selective_ac_option, + } + + +def create_process_config( + procs: int = 8, + with_gpus: bool = True, + hosts: Optional[int] = None, +) -> Dict[str, Any]: + """ + Create process configuration. + + Args: + procs: Number of processes per host + with_gpus: Whether to use GPUs + hosts: Number of hosts (None = single node) + + Returns: + Dictionary with process configuration + """ + config = { + "procs": procs, + "with_gpus": with_gpus, + } + if hosts is not None: + config["hosts"] = hosts + return config + + +# ============================================================================ +# Configuration Assembly +# ============================================================================ + + +def build_config( + model_config: Dict[str, Any], + optimizer_config: Dict[str, Any], + lr_scheduler_config: Dict[str, Any], + training_config: Dict[str, Any], + parallelism_config: Dict[str, Any], + checkpoint_config: Dict[str, Any], + activation_checkpoint_config: Dict[str, Any], + process_config: Dict[str, Any], +) -> DictConfig: + """ + Build complete configuration from component configs. + + Args: + model_config: Model configuration + optimizer_config: Optimizer configuration + lr_scheduler_config: LR scheduler configuration + training_config: Training configuration + parallelism_config: Parallelism configuration + checkpoint_config: Checkpoint configuration + activation_checkpoint_config: Activation checkpoint configuration + process_config: Process configuration + + Returns: + Complete OmegaConf DictConfig + """ + config = { + "comm": {"trace_buf_size": 0}, + "model": model_config, + "optimizer": optimizer_config, + "lr_scheduler": lr_scheduler_config, + "training": training_config, + "parallelism": parallelism_config, + "checkpoint": checkpoint_config, + "activation_checkpoint": activation_checkpoint_config, + "processes": process_config, + } + + return OmegaConf.create(config) + + +# ============================================================================ +# Training Functions +# ============================================================================ + + +async def create_recipe(config: DictConfig): + """ + Create and return a ForgeSFTRecipe actor. + + Args: + config: Complete configuration + + Returns: + ForgeSFTRecipe actor instance + """ + process_cfg = config.pop("processes") + recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(config) + logger.info("Recipe created successfully") + return recipe + + +async def setup_recipe(recipe): + """ + Setup the recipe (load model, initialize data loaders, etc.). + + Args: + recipe: ForgeSFTRecipe actor instance + """ + logger.info("Setting up recipe...") + await recipe.setup.call() + logger.info("Recipe setup complete") + + +async def train_recipe(recipe): + """ + Run training on the recipe. + + Args: + recipe: ForgeSFTRecipe actor instance + """ + logger.info("Starting training...") + await recipe.train.call() + logger.info("Training complete") + + +async def cleanup_recipe(recipe): + """ + Cleanup recipe resources. 
+ + Args: + recipe: ForgeSFTRecipe actor instance + """ + logger.info("Cleaning up...") + await recipe.cleanup.call() + await recipe.mesh.stop() + logger.info("Cleanup complete") + + +# ============================================================================ +# High-Level Training API +# ============================================================================ + + +async def run_training(config: DictConfig): + """ + Run complete training pipeline with the given configuration. + + Args: + config: Complete configuration + + Raises: + Exception: If training fails + """ + # Create recipe + recipe = await create_recipe(config) + + # Setup + await setup_recipe(recipe) + + # Train + await train_recipe(recipe) + + # Cleanup + await cleanup_recipe(recipe) + + +def train(config: DictConfig): + """ + Synchronous wrapper for run_training. + + Args: + config: Complete configuration + """ + asyncio.run(run_training(config)) + + +# ============================================================================ +# Display Utilities +# ============================================================================ + + +def print_config(config: DictConfig, title: str = "Configuration"): + """ + Pretty print configuration. + + Args: + config: Configuration to print + title: Title for the output + """ + print(f"\n{'='*60}") + print(f"{title:^60}") + print(f"{'='*60}") + print(OmegaConf.to_yaml(config)) + print(f"{'='*60}\n") + + +def summarize_config(config: DictConfig): + """ + Print a summary of the configuration. + + Args: + config: Configuration to summarize + """ + print("\n" + "=" * 60) + print("Configuration Summary".center(60)) + print("=" * 60) + + print(f"\n๐Ÿ“ฆ Model:") + print(f" โ€ข Name: {config.model.name}") + print(f" โ€ข Flavor: {config.model.flavor}") + print(f" โ€ข Path: {config.model.hf_assets_path}") + + print(f"\nโš™๏ธ Training:") + print(f" โ€ข Steps: {config.training.steps}") + print(f" โ€ข Batch Size: {config.training.local_batch_size}") + print(f" โ€ข Sequence Length: {config.training.seq_len}") + print(f" โ€ข Dataset: {config.training.dataset}") + + print(f"\n๐Ÿ”ง Optimizer:") + print(f" โ€ข Name: {config.optimizer.name}") + print(f" โ€ข Learning Rate: {config.optimizer.lr}") + print(f" โ€ข Warmup Steps: {config.lr_scheduler.warmup_steps}") + + print(f"\n๐Ÿ”€ Parallelism:") + print( + f" โ€ข Data Parallel (Replicate): {config.parallelism.data_parallel_replicate_degree}" + ) + print( + f" โ€ข Data Parallel (Shard/FSDP): {config.parallelism.data_parallel_shard_degree}" + ) + print(f" โ€ข Tensor Parallel: {config.parallelism.tensor_parallel_degree}") + print(f" โ€ข Pipeline Parallel: {config.parallelism.pipeline_parallel_degree}") + + print(f"\n๐Ÿ’พ Checkpointing:") + print(f" โ€ข Enabled: {config.checkpoint.enable}") + print(f" โ€ข Folder: {config.checkpoint.folder}") + print(f" โ€ข Interval: {config.checkpoint.interval} steps") + + print(f"\n๐Ÿ–ฅ๏ธ Resources:") + if "hosts" in config.processes: + print(f" โ€ข Hosts: {config.processes.hosts}") + print(f" โ€ข Processes per host: {config.processes.procs}") + print(f" โ€ข GPUs: {config.processes.with_gpus}") + + print("\n" + "=" * 60 + "\n") diff --git a/apps/sft_v2/sft_training_notebook.ipynb b/apps/sft_v2/sft_training_notebook.ipynb new file mode 100644 index 000000000..204ec15a9 --- /dev/null +++ b/apps/sft_v2/sft_training_notebook.ipynb @@ -0,0 +1,568 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ๐Ÿš€ SFT Training Notebook\n", + "\n", + "This notebook provides an interactive interface for 
training Language Models using Supervised Fine-Tuning (SFT).\n", + "\n", + "## Features\n", + "- โœ… Interactive configuration in separate cells\n", + "- โœ… Support for single-node and multi-node training\n", + "- โœ… Easy hyperparameter tuning\n", + "- โœ… Flexible parallelism strategies\n", + "- โœ… Checkpoint management\n", + "\n", + "## Quick Start\n", + "1. Configure each section (model, training, etc.)\n", + "2. Review the complete configuration\n", + "3. Run training!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“š Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '/home/hosseinkh/forge')\n", + "\n", + "from apps.sft_v2 import notebook_utils as nb\n", + "import torch\n", + "\n", + "print(f\"โœ… Imports successful!\")\n", + "print(f\"๐Ÿ“Š PyTorch version: {torch.__version__}\")\n", + "print(f\"๐ŸŽฎ CUDA available: {torch.cuda.is_available()}\")\n", + "if torch.cuda.is_available():\n", + " print(f\"๐Ÿ”ข Number of GPUs: {torch.cuda.device_count()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“ฆ Model Configuration\n", + "\n", + "Configure the model you want to train." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Model Configuration\n", + "model_config = nb.create_model_config(\n", + " name=\"llama3\",\n", + " flavor=\"8B\",\n", + " hf_assets_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct\"\n", + ")\n", + "\n", + "print(\"๐Ÿ“ฆ Model Configuration:\")\n", + "for key, value in model_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## โš™๏ธ Training Configuration\n", + "\n", + "Set training hyperparameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Training Configuration\n", + "training_config = nb.create_training_config(\n", + " local_batch_size=1, # Batch size per GPU\n", + " seq_len=2048, # Sequence length\n", + " max_norm=1.0, # Gradient clipping\n", + " steps=1000, # Total training steps\n", + " dataset=\"c4\", # Dataset name\n", + " compile=False # Use torch.compile?\n", + ")\n", + "\n", + "print(\"โš™๏ธ Training Configuration:\")\n", + "for key, value in training_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”ง Optimizer Configuration\n", + "\n", + "Configure the optimizer and learning rate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optimizer Configuration\n", + "optimizer_config = nb.create_optimizer_config(\n", + " name=\"AdamW\",\n", + " lr=1e-5, # Learning rate\n", + " eps=1e-8, # Epsilon\n", + " weight_decay=0.0, # Weight decay\n", + " betas=(0.9, 0.999) # Adam betas\n", + ")\n", + "\n", + "# LR Scheduler Configuration\n", + "lr_scheduler_config = nb.create_lr_scheduler_config(\n", + " warmup_steps=200, # Warmup steps\n", + " decay_steps=None, # Decay steps (None = no decay)\n", + " min_lr=0.0 # Minimum LR\n", + ")\n", + "\n", + "print(\"๐Ÿ”ง Optimizer Configuration:\")\n", + "for key, value in optimizer_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "\n", + "print(\"\\n๐Ÿ“ˆ LR Scheduler Configuration:\")\n", + "for key, value in lr_scheduler_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”€ Parallelism Configuration\n", + "\n", + "Configure distributed training strategies.\n", + "\n", + "### Parallelism Options:\n", + "- **Data Parallel (Replicate)**: Basic data parallelism\n", + "- **Data Parallel (Shard/FSDP)**: Fully Sharded Data Parallel (-1 = use all GPUs)\n", + "- **Tensor Parallel**: Split model across multiple GPUs\n", + "- **Pipeline Parallel**: Split model stages across GPUs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Parallelism Configuration\n", + "parallelism_config = nb.create_parallelism_config(\n", + " data_parallel_replicate_degree=1, # DP replicate\n", + " data_parallel_shard_degree=-1, # FSDP (-1 = auto, uses all GPUs)\n", + " tensor_parallel_degree=1, # TP\n", + " pipeline_parallel_degree=1, # PP\n", + " context_parallel_degree=1, # CP\n", + " expert_parallel_degree=1, # EP (for MoE)\n", + " disable_loss_parallel=False\n", + ")\n", + "\n", + "print(\"๐Ÿ”€ Parallelism Configuration:\")\n", + "for key, value in parallelism_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ’พ Checkpoint Configuration\n", + "\n", + "Configure model checkpointing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Checkpoint Configuration\n", + "checkpoint_config = nb.create_checkpoint_config(\n", + " enable=True,\n", + " folder=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", + " initial_load_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/\",\n", + " initial_load_in_hf=True,\n", + " last_save_in_hf=True,\n", + " interval=500, # Save every N steps\n", + " async_mode=\"disabled\"\n", + ")\n", + "\n", + "# Activation Checkpoint Configuration (for memory efficiency)\n", + "activation_checkpoint_config = nb.create_activation_checkpoint_config(\n", + " mode=\"selective\", # 'selective', 'full', or 'none'\n", + " selective_ac_option=\"op\" # 'op' or 'layer'\n", + ")\n", + "\n", + "print(\"๐Ÿ’พ Checkpoint Configuration:\")\n", + "for key, value in checkpoint_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "\n", + "print(\"\\n๐Ÿ”„ Activation Checkpoint Configuration:\")\n", + "for key, value in activation_checkpoint_config.items():\n", + " print(f\" โ€ข {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ–ฅ๏ธ Resource Configuration\n", + "\n", + "Configure compute resources.\n", + "\n", + "### Options:\n", + "- **Single Node**: Set only `procs` (number of GPUs)\n", + "- **Multi Node**: Set both `hosts` (number of nodes) and `procs` (GPUs per node)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Choose ONE of the following:\n", + "\n", + "# Option 1: Single Node (8 GPUs)\n", + "process_config = nb.create_process_config(\n", + " procs=8,\n", + " with_gpus=True,\n", + " hosts=None # None = single node\n", + ")\n", + "\n", + "# Option 2: Multi-Node (4 nodes ร— 8 GPUs = 32 total)\n", + "# Uncomment to use:\n", + "# process_config = nb.create_process_config(\n", + "# procs=8,\n", + "# with_gpus=True,\n", + "# hosts=4\n", + "# )\n", + "\n", + "print(\"๐Ÿ–ฅ๏ธ Resource Configuration:\")\n", + "for key, value in process_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "\n", + "if \"hosts\" in process_config and process_config[\"hosts\"]:\n", + " total_gpus = process_config[\"hosts\"] * process_config[\"procs\"]\n", + " print(f\"\\n๐Ÿ“Š Total GPUs: {total_gpus}\")\n", + "else:\n", + " print(f\"\\n๐Ÿ“Š Total GPUs: {process_config['procs']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## โ˜๏ธ Provisioner Configuration (Optional)\n", + "\n", + "**Only needed for multi-node training on SLURM clusters.**\n", + "\n", + "โš ๏ธ Skip this cell if you're running single-node training!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Provisioner Configuration (OPTIONAL - for multi-node only)\n", + "# Set to None for single-node training\n", + "\n", + "provisioner_config = None # Default: no provisioner\n", + "\n", + "# Uncomment and configure for SLURM multi-node training:\n", + "# provisioner_config = nb.create_provisioner_config(\n", + "# launcher=\"slurm\",\n", + "# job_name=\"sft_training\",\n", + "# partition=\"your_gpu_partition\", # REQUIRED for SLURM\n", + "# time=\"24:00:00\", # REQUIRED for SLURM\n", + "# account=\"your_account\" # May be required\n", + "# )\n", + "\n", + "if provisioner_config:\n", + " print(\"โ˜๏ธ Provisioner Configuration:\")\n", + " for key, value in provisioner_config.items():\n", + " print(f\" โ€ข {key}: {value}\")\n", + "else:\n", + " print(\"โ˜๏ธ Provisioner: Disabled (single-node mode)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ”จ Build Complete Configuration\n", + "\n", + "Combine all configurations into a single config object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build complete configuration\n", + "config = nb.build_config(\n", + " model_config=model_config,\n", + " optimizer_config=optimizer_config,\n", + " lr_scheduler_config=lr_scheduler_config,\n", + " training_config=training_config,\n", + " parallelism_config=parallelism_config,\n", + " checkpoint_config=checkpoint_config,\n", + " activation_checkpoint_config=activation_checkpoint_config,\n", + " process_config=process_config,\n", + " provisioner_config=provisioner_config\n", + ")\n", + "\n", + "print(\"โœ… Configuration built successfully!\\n\")\n", + "\n", + "# Display summary\n", + "nb.summarize_config(config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“„ View Full Configuration (YAML)\n", + "\n", + "See the complete configuration in YAML format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print full configuration\n", + "nb.print_config(config, title=\"Complete Training Configuration\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ’พ Save Configuration (Optional)\n", + "\n", + "Save the configuration to a YAML file for later use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from omegaconf import OmegaConf\n", + "\n", + "# Save configuration\n", + "config_path = \"/home/hosseinkh/forge/apps/sft_v2/my_training_config.yaml\"\n", + "with open(config_path, 'w') as f:\n", + " OmegaConf.save(config, f)\n", + "\n", + "print(f\"โœ… Configuration saved to: {config_path}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿš€ Run Training!\n", + "\n", + "Start the training process with the configured settings.\n", + "\n", + "โš ๏ธ **Note**: This will start actual training and may take a long time!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training\n", + "print(\"๐Ÿš€ Starting training...\\n\")\n", + "\n", + "try:\n", + " nb.train(config)\n", + " print(\"\\nโœ… Training completed successfully!\")\n", + "except Exception as e:\n", + " print(f\"\\nโŒ Training failed: {e}\")\n", + " import traceback\n", + " traceback.print_exc()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ” Advanced: Step-by-Step Execution\n", + "\n", + "For more control, you can run each training stage separately.\n", + "\n", + "โš ๏ธ **Only run this section if you want manual control. Otherwise, use the cell above.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Initialize provisioner (if configured)\n", + "import asyncio\n", + "\n", + "provisioner_initialized = await nb.initialize_provisioner(config)\n", + "print(f\"Provisioner initialized: {provisioner_initialized}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 2: Create recipe\n", + "recipe = await nb.create_recipe(config)\n", + "print(\"Recipe created\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 3: Setup recipe (load model, data, etc.)\n", + "await nb.setup_recipe(recipe)\n", + "print(\"Recipe setup complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 4: Run training\n", + "await nb.train_recipe(recipe)\n", + "print(\"Training complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 5: Cleanup\n", + "await nb.cleanup_recipe(recipe)\n", + "print(\"Cleanup complete\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 6: Shutdown provisioner (if initialized)\n", + "if provisioner_initialized:\n", + " await nb.shutdown_provisioner(config)\n", + " print(\"Provisioner shutdown complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐Ÿ“Š Tips & Tricks\n", + "\n", + "### Memory Optimization\n", + "- Use **FSDP** (set `data_parallel_shard_degree=-1`) for large models\n", + "- Enable **activation checkpointing** (set `mode=\"selective\"` or `\"full\"`)\n", + "- Reduce **batch size** or **sequence length**\n", + "\n", + "### Speed Optimization\n", + "- Use **tensor parallelism** for large models (set `tensor_parallel_degree > 1`)\n", + "- Enable **compilation** (set `compile=True`)\n", + "- Increase **batch size** if memory allows\n", + "\n", + "### Multi-Node Training\n", + "- Set `hosts` in process config\n", + "- Configure provisioner with SLURM details\n", + "- Make sure model path is accessible on all nodes\n", + "\n", + "### Debugging\n", + "- Start with fewer steps (e.g., `steps=10`)\n", + "- Use single GPU first (`procs=1`)\n", + "- Check logs for errors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ๐ŸŽฏ Common Configurations\n", + "\n", + "### Quick Test Run\n", + "```python\n", + "training_config = nb.create_training_config(\n", + " steps=10,\n", + " local_batch_size=1\n", + ")\n", + "process_config = nb.create_process_config(procs=1)\n", + "```\n", + "\n", + "### Single Node, 8 GPUs, FSDP\n", + "```python\n", + 
"parallelism_config = nb.create_parallelism_config(\n", + " data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP\n", + ")\n", + "process_config = nb.create_process_config(procs=8)\n", + "```\n", + "\n", + "### Multi-Node, 4ร—8 GPUs, TP=2\n", + "```python\n", + "parallelism_config = nb.create_parallelism_config(\n", + " data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP\n", + " tensor_parallel_degree=2\n", + ")\n", + "process_config = nb.create_process_config(procs=8, hosts=4)\n", + "provisioner_config = nb.create_provisioner_config(\n", + " launcher=\"slurm\",\n", + " partition=\"gpu_partition\"\n", + ")\n", + "```" + ] + } + ], + "metadata": { + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From baeb35b26a4ee495cabfaed5391c0a49e9807733 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Thu, 9 Oct 2025 13:29:23 -0700 Subject: [PATCH 2/7] Submitting an interactive notebook to run SFT --- apps/sft_v2/NOTEBOOK_GUIDE.md | 847 ++++++++++++++++++ apps/sft_v2/README_NOTEBOOK.md | 435 --------- apps/sft_v2/actor.py | 133 +++ apps/sft_v2/interactive_config_notebook.ipynb | 629 +++++++++++++ apps/sft_v2/notebook_utils.py | 463 ---------- apps/sft_v2/sft_training_notebook.ipynb | 568 ------------ apps/sft_v2/spawn_actor.py | 139 +++ apps/sft_v2/trainer_actor.py | 189 ++++ apps/sft_v2/utils.py | 187 ++++ 9 files changed, 2124 insertions(+), 1466 deletions(-) create mode 100644 apps/sft_v2/NOTEBOOK_GUIDE.md delete mode 100644 apps/sft_v2/README_NOTEBOOK.md create mode 100644 apps/sft_v2/actor.py create mode 100644 apps/sft_v2/interactive_config_notebook.ipynb delete mode 100644 apps/sft_v2/notebook_utils.py delete mode 100644 apps/sft_v2/sft_training_notebook.ipynb create mode 100644 apps/sft_v2/spawn_actor.py create mode 100644 apps/sft_v2/trainer_actor.py create mode 100644 apps/sft_v2/utils.py diff --git a/apps/sft_v2/NOTEBOOK_GUIDE.md b/apps/sft_v2/NOTEBOOK_GUIDE.md new file mode 100644 index 000000000..b3524ed31 --- /dev/null +++ b/apps/sft_v2/NOTEBOOK_GUIDE.md @@ -0,0 +1,847 @@ +# Complete Guide: Interactive Configuration Notebook + +This guide explains step-by-step how to use the interactive configuration notebook for SFT training. + +--- + +## ๐Ÿ“– Table of Contents + +1. [Overview](#overview) +2. [Architecture Components](#architecture-components) +3. [Notebook Step-by-Step](#notebook-step-by-step) +4. [Utility Functions Explained](#utility-functions-explained) +5. [How to Run](#how-to-run) +6. [Common Scenarios](#common-scenarios) +7. [Troubleshooting](#troubleshooting) + +--- + +## Overview + +The interactive configuration notebook (`interactive_config_notebook.ipynb`) allows you to: +- Configure SFT training **without YAML files** +- Define configuration interactively in separate cells +- Easily modify parameters and experiment +- Use pre-built templates for common scenarios + +### What Problem Does This Solve? + +**Before**: You had to edit YAML files, which required: +- External file management +- Reloading files after changes +- Difficult to experiment with different configs + +**After**: You can: +- Define everything in the notebook +- Change values in cells and re-run +- See all configurations clearly +- No external file management needed + +--- + +## Architecture Components + +Before diving into the notebook, let's understand the components: + +### 1. BaseForgeActor (`actor.py`) + +**What it is**: An abstract base class that defines the contract for all actors. 
+ +**What it does**: +- Handles distributed initialization (sets up multi-GPU environment) +- Manages common attributes (model, optimizer, checkpointer, etc.) +- Defines three required methods that subclasses must implement: + - `setup()` - Initialize data, checkpoints, etc. + - `run()` - Main execution logic + - `cleanup()` - Resource cleanup + +**Why it matters**: Provides a consistent interface for different actor types (Trainer, Evaluator, Inferencer, etc.) + +### 2. TrainerActor (`trainer_actor.py`) + +**What it is**: A concrete implementation of BaseForgeActor for training. + +**What it does**: +- Implements the training loop +- Handles forward/backward passes +- Manages checkpointing +- Supports various parallelism strategies (FSDP, Pipeline Parallel, Tensor Parallel) + +**Key Methods**: +- `setup()` - Loads tokenizer, dataset, and checkpoints +- `run()` - Executes the training loop +- `forward_backward()` - Performs forward and backward passes +- `train_step()` - Single training step +- `cleanup()` - Closes resources + +### 3. SpawnActor (`spawn_actor.py`) + +**What it is**: An orchestrator that manages actor lifecycle. + +**What it does**: +- Creates actor instances +- Manages the lifecycle: spawn โ†’ setup โ†’ run โ†’ cleanup +- Provides error handling and cleanup guarantees + +**Key Methods**: +- `spawn()` - Creates the actor instance +- `setup()` - Calls actor's setup +- `run()` - Calls actor's run +- `cleanup()` - Calls actor's cleanup and stops the mesh +- `run_full_lifecycle()` - Executes all phases automatically + +**Why it matters**: Simplifies actor management and ensures proper resource cleanup. + +### 4. Utility Functions (`utils.py`) + +Helper functions for common operations. See [Utility Functions Explained](#utility-functions-explained) section below. + +--- + +## Notebook Step-by-Step + +### Step 1: Import Dependencies + +```python +import asyncio +import logging +from omegaconf import OmegaConf, DictConfig + +from forge.apps.sft_v2.trainer_actor import TrainerActor +from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor +``` + +**What this does**: +- `asyncio` - For async/await operations (actors run asynchronously) +- `logging` - For logging training progress +- `OmegaConf` - For managing configurations (converts dicts to config objects) +- `TrainerActor` - The training actor we'll use +- `SpawnActor`, `run_actor` - For managing actor lifecycle + +**Why we need it**: These are the core dependencies for running the actor-based training. 
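+
+As a quick orientation before the configuration steps, here is a minimal sketch of how these imports fit together (the same flow Steps 2-12 build up in detail; `complete_config` below stands in for the plain dict assembled later in Step 11):
+
+```python
+# Turn the plain dict assembled in Steps 2-11 into an OmegaConf config.
+cfg = OmegaConf.create(complete_config)
+
+# Simple path: spawn, setup, run, and clean up in one call.
+await run_actor(TrainerActor, cfg)
+
+# Or, for phase-by-phase control (see "Alternative: Manual Lifecycle Control"):
+spawner = SpawnActor(TrainerActor, cfg)
+actor = await spawner.spawn()
+await spawner.setup()
+await spawner.run()
+await spawner.cleanup()
+```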
+ +--- + +### Step 2: Configure Model Settings + +```python +model_config = { + "name": "llama3", + "flavor": "8B", + "hf_assets_path": "/tmp/Meta-Llama-3.1-8B-Instruct" +} +``` + +**What this does**: +- `name` - Model architecture type (e.g., "llama3", "llama2") +- `flavor` - Model size (e.g., "8B", "70B", "405B") +- `hf_assets_path` - Path to the model files (tokenizer, weights, config) + +**How to modify**: +- Change `flavor` to use different model sizes +- Update `hf_assets_path` to point to your model location +- Make sure the path contains `tokenizer.json`, `tokenizer_config.json`, and model weights + +**Example variations**: +```python +# For a 70B model +model_config = { + "name": "llama3", + "flavor": "70B", + "hf_assets_path": "/path/to/Meta-Llama-3.1-70B" +} +``` + +--- + +### Step 3: Configure Process Settings + +```python +processes_config = { + "procs": 8, # Number of processes + "with_gpus": True # Use GPUs +} +``` + +**What this does**: +- `procs` - Number of parallel processes (usually = number of GPUs) +- `with_gpus` - Whether to use GPUs or CPUs + +**How to modify**: +- For single GPU: `"procs": 1` +- For 4 GPUs: `"procs": 4` +- For CPU training: `"with_gpus": False` (not recommended for LLMs) + +**Important**: Set `procs` to match your available GPUs! + +--- + +### Step 4: Configure Optimizer Settings + +```python +optimizer_config = { + "name": "AdamW", + "lr": 1e-5, # Learning rate + "eps": 1e-8 +} +``` + +**What this does**: +- `name` - Optimizer type (AdamW is recommended for LLMs) +- `lr` - Learning rate (how fast the model learns) +- `eps` - Epsilon for numerical stability + +**How to modify**: +- **Lower learning rate** (e.g., `1e-6`) for fine-tuning +- **Higher learning rate** (e.g., `5e-5`) for pre-training (use with caution) +- Typical range for fine-tuning: `1e-6` to `1e-4` + +**Tips**: +- Start conservative with `1e-5` or `2e-5` +- If loss explodes, reduce learning rate +- If training is too slow, slightly increase learning rate + +--- + +### Step 5: Configure Learning Rate Scheduler + +```python +lr_scheduler_config = { + "warmup_steps": 200 # Number of warmup steps +} +``` + +**What this does**: +- `warmup_steps` - Number of steps to gradually increase learning rate from 0 to `lr` + +**Why warmup**: Prevents training instability at the beginning by starting with a low learning rate. 
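+
+To make the warmup idea concrete, here is an illustrative sketch of a linear warmup schedule (an assumption for intuition only; the scheduler actually used by the trainer may differ):
+
+```python
+def warmup_lr(step: int, base_lr: float = 1e-5, warmup_steps: int = 200) -> float:
+    # Ramp linearly from near zero up to base_lr over the warmup window, then hold.
+    return base_lr * min(1.0, (step + 1) / warmup_steps)
+
+print(warmup_lr(0))    # 5e-08  -- first step starts tiny
+print(warmup_lr(199))  # 1e-05  -- warmup complete, full learning rate
+```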
+ +**How to modify**: +- For short training (< 1000 steps): use 10-50 warmup steps +- For medium training (1000-5000 steps): use 100-200 warmup steps +- For long training (> 5000 steps): use 200-500 warmup steps +- Rule of thumb: ~5-10% of total training steps + +--- + +### Step 6: Configure Training Settings + +```python +training_config = { + "local_batch_size": 1, # Batch size per GPU + "seq_len": 2048, # Sequence length + "max_norm": 1.0, # Gradient clipping + "steps": 1000, # Total training steps + "compile": False, # PyTorch compilation + "dataset": "c4" # Dataset name +} +``` + +**What this does**: +- `local_batch_size` - Number of samples per GPU per step +- `seq_len` - Maximum sequence length (in tokens) +- `max_norm` - Gradient clipping threshold (prevents exploding gradients) +- `steps` - Total number of training steps +- `compile` - Enable PyTorch 2.0 compilation (experimental) +- `dataset` - Dataset identifier + +**How to modify**: + +**For Memory Issues**: +- Reduce `seq_len` (e.g., from 2048 to 1024) +- Reduce `local_batch_size` (e.g., from 2 to 1) +- Both reduce memory usage + +**For Faster Training**: +- Increase `local_batch_size` if you have memory +- Use shorter `seq_len` for tasks that don't need long context + +**For Quick Testing**: +- Set `steps` to 10-100 for quick validation + +**Global batch size** = `local_batch_size` ร— `procs` ร— `data_parallel_shard_degree` + +--- + +### Step 7: Configure Parallelism Settings + +```python +parallelism_config = { + "data_parallel_replicate_degree": 1, + "data_parallel_shard_degree": -1, # -1 = use all GPUs for FSDP + "tensor_parallel_degree": 1, + "pipeline_parallel_degree": 1, + "context_parallel_degree": 1, + "expert_parallel_degree": 1, + "disable_loss_parallel": False +} +``` + +**What this does**: + +- **Data Parallel Shard Degree (FSDP)**: Splits model parameters across GPUs + - `-1` means use all available GPUs + - `8` means split across 8 GPUs + - Most common strategy for fine-tuning + +- **Tensor Parallel Degree**: Splits individual layers across GPUs + - Use for very large models that don't fit on single GPU even with FSDP + - `1` means no tensor parallelism + +- **Pipeline Parallel Degree**: Splits model into sequential stages + - Use for extremely large models + - `1` means no pipeline parallelism + +- **Context Parallel Degree**: Splits sequence dimension + - For very long sequences + - `1` means no context parallelism + +**Common Configurations**: + +**Single GPU**: +```python +"data_parallel_shard_degree": 1 +``` + +**8 GPUs with FSDP (recommended)**: +```python +"data_parallel_shard_degree": -1 # or 8 +``` + +**Large Model (70B+) with Tensor Parallelism**: +```python +"data_parallel_shard_degree": 4, +"tensor_parallel_degree": 2 +``` + +--- + +### Step 8: Configure Checkpoint Settings + +```python +checkpoint_config = { + "enable": True, + "folder": "/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints", + "initial_load_path": "/tmp/Meta-Llama-3.1-8B-Instruct/", + "initial_load_in_hf": True, + "last_save_in_hf": True, + "interval": 500, # Save every N steps + "async_mode": "disabled" +} +``` + +**What this does**: +- `enable` - Whether to enable checkpointing +- `folder` - Where to save checkpoints +- `initial_load_path` - Where to load initial weights from +- `initial_load_in_hf` - Load weights in HuggingFace format +- `last_save_in_hf` - Save final checkpoint in HuggingFace format +- `interval` - How often to save (in steps) +- `async_mode` - Async saving mode (use "disabled" for simplicity) + +**How to 
modify**: +- **Save more frequently**: Reduce `interval` (e.g., 100) +- **Save less frequently**: Increase `interval` (e.g., 1000) +- **Resume training**: Point `initial_load_path` to your checkpoint folder + +**Important**: Make sure `folder` path exists and has enough disk space! + +--- + +### Step 9: Configure Activation Checkpointing + +```python +activation_checkpoint_config = { + "mode": "selective", + "selective_ac_option": "op" +} +``` + +**What this does**: +- Saves memory by recomputing activations during backward pass instead of storing them +- `mode` - Checkpointing mode ("selective" or "full") +- `selective_ac_option` - Which operations to checkpoint + +**Memory vs Speed Trade-off**: +- **Activation checkpointing ON**: Lower memory, slower training +- **Activation checkpointing OFF**: Higher memory, faster training + +**When to use**: Enable when running out of memory. + +--- + +### Step 10: Configure Communication Settings + +```python +comm_config = { + "trace_buf_size": 0 +} +``` + +**What this does**: +- Configuration for distributed communication (required by TorchTitan) +- Usually you don't need to modify this + +--- + +### Step 11: Combine All Configurations + +```python +complete_config = { + "comm": comm_config, + "model": model_config, + "processes": processes_config, + "optimizer": optimizer_config, + "lr_scheduler": lr_scheduler_config, + "training": training_config, + "parallelism": parallelism_config, + "checkpoint": checkpoint_config, + "activation_checkpoint": activation_checkpoint_config +} + +cfg = OmegaConf.create(complete_config) +``` + +**What this does**: +- Combines all configuration sections into one complete config +- Converts to OmegaConf format (allows dot notation access) + +**Prints**: The complete configuration in YAML format for review + +--- + +### Step 12: Run Training (Simple Way) + +```python +await run_actor(TrainerActor, cfg) +``` + +**What this does**: +- Spawns the trainer actor +- Runs setup (loads data, model, checkpoints) +- Runs training loop +- Cleans up resources +- All in one line! + +**When to use**: When you want fully automatic training with no manual intervention. 
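+
+**Running outside a notebook**: the `await` form above relies on Jupyter's running event loop. From a plain Python script, a minimal sketch (assuming you run from the repo root and either load one of the provided YAML files or build the dict from Steps 2-11) looks like this:
+
+```python
+import asyncio
+
+from omegaconf import OmegaConf
+
+from forge.apps.sft_v2.spawn_actor import run_actor
+from forge.apps.sft_v2.trainer_actor import TrainerActor
+
+# Load an existing config, or pass OmegaConf.create(complete_config) instead.
+cfg = OmegaConf.load("apps/sft_v2/llama3_8b.yaml")
+asyncio.run(run_actor(TrainerActor, cfg))
+```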
+ +--- + +### Alternative: Manual Lifecycle Control + +For more control over the training process: + +#### Create and Spawn the Actor + +```python +spawner = SpawnActor(TrainerActor, cfg) +actor = await spawner.spawn() +``` + +**What this does**: +- Creates a spawner with your config +- Spawns the actor instance (allocates resources, initializes distributed environment) + +#### Setup the Actor + +```python +await spawner.setup() +``` + +**What this does**: +- Loads tokenizer from `hf_assets_path` +- Loads training dataset +- Initializes model +- Loads checkpoint if specified + +**At this point**: You could inspect the actor state before training: +```python +print(f"Current step: {actor.current_step}") +print(f"Device: {actor.device}") +``` + +#### Run Training + +```python +await spawner.run() +``` + +**What this does**: +- Executes the training loop +- Iterates through batches +- Performs forward/backward passes +- Updates weights +- Saves checkpoints at intervals + +#### Cleanup + +```python +await spawner.cleanup() +``` + +**What this does**: +- Closes checkpointer +- Closes logger +- Stops the actor mesh +- Frees resources + +**When to use manual control**: +- When you want to inspect state between phases +- When you want to modify configuration between setup and run +- For debugging purposes + +--- + +## Utility Functions Explained + +The `utils.py` module provides reusable helper functions: + +### 1. `setup_tokenizer()` + +```python +def setup_tokenizer( + hf_assets_path: str, + tokenizer_filename: str = "tokenizer.json", + tokenizer_config_filename: str = "tokenizer_config.json", + generation_config_filename: str = "generation_config.json", +) -> HuggingFaceModelTokenizer +``` + +**What it does**: +- Loads a HuggingFace tokenizer from the model assets directory +- Initializes tokenizer with config and generation settings + +**Parameters**: +- `hf_assets_path` - Path to directory containing tokenizer files +- Other parameters are filenames (usually don't need to change) + +**Returns**: Initialized `HuggingFaceModelTokenizer` object + +**Example**: +```python +tokenizer = setup_tokenizer("/tmp/Meta-Llama-3.1-8B-Instruct") +``` + +**When to use**: If you need to use the tokenizer independently (e.g., for preprocessing data) + +--- + +### 2. 
`setup_sft_dataloader()` + +```python +def setup_sft_dataloader( + tokenizer: HuggingFaceModelTokenizer, + dataset_path: str, + dataset_split: str, + target_tokens_per_pack: int, + batch_size: int, + device: torch.device, + padding_idx: int = 0, + message_transform: Optional[Any] = None, +) -> StatefulDataLoader +``` + +**What it does**: +- Creates a dataloader for supervised fine-tuning +- Handles data loading, tokenization, and packing +- Returns a StatefulDataLoader (can save/restore state for checkpointing) + +**Parameters**: +- `tokenizer` - Tokenizer to use for text processing +- `dataset_path` - HuggingFace dataset name (e.g., "yahma/alpaca-cleaned") +- `dataset_split` - Which split to use ("train", "validation", "test") +- `target_tokens_per_pack` - Sequence length (same as `seq_len` in config) +- `batch_size` - Batch size (same as `local_batch_size` in config) +- `device` - Which device to move tensors to +- `padding_idx` - Token ID for padding (usually 0) +- `message_transform` - Transform to convert dataset format (default: AlpacaToMessages) + +**Returns**: Configured `StatefulDataLoader` + +**Example**: +```python +dataloader = setup_sft_dataloader( + tokenizer=tokenizer, + dataset_path="yahma/alpaca-cleaned", + dataset_split="train", + target_tokens_per_pack=2048, + batch_size=4, + device=torch.device("cuda"), +) +``` + +**When to use**: If you want to create a custom dataloader outside of TrainerActor + +--- + +### 3. `create_context_parallel_context()` + +```python +def create_context_parallel_context( + parallel_dims: ParallelDims, + inputs: torch.Tensor, + labels: torch.Tensor, + model_parts: list, + rotate_method: str, +) +``` + +**What it does**: +- Creates context for context parallelism (splits sequence across GPUs) +- Returns None if context parallelism is disabled + +**Parameters**: +- `parallel_dims` - Parallel dimensions configuration +- `inputs` - Input tensor +- `labels` - Label tensor +- `model_parts` - List of model parts +- `rotate_method` - Rotation method for context parallel + +**Returns**: Context parallel context or None + +**When to use**: Internally used by TrainerActor. You rarely need to call this directly. + +--- + +### 4. `move_batch_to_device()` + +```python +def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any] +``` + +**What it does**: +- Moves all tensors in a batch dictionary to the specified device +- Leaves non-tensor values unchanged + +**Parameters**: +- `batch` - Dictionary containing batch data +- `device` - Target device (e.g., `torch.device("cuda")`) + +**Returns**: Batch with tensors moved to device + +**Example**: +```python +batch = {"tokens": tensor, "labels": tensor, "metadata": "some_string"} +batch = move_batch_to_device(batch, torch.device("cuda")) +``` + +**When to use**: Useful when manually processing batches + +--- + +### 5. `log_training_step()` + +```python +def log_training_step( + step: int, + total_steps: int, + loss: torch.Tensor, + logger: logging.Logger, +) +``` + +**What it does**: +- Logs training progress in a formatted way +- Shows current step, total steps, and loss value + +**Parameters**: +- `step` - Current training step +- `total_steps` - Total number of training steps +- `loss` - Current loss tensor +- `logger` - Logger instance + +**Example output**: +``` +Step 100/1000 | Loss: 2.3456 +``` + +**When to use**: Internally used by TrainerActor. You can use it for custom logging. + +--- + +## How to Run + +### Prerequisites + +1. 
**Download Model**:
```bash
export HF_HUB_DISABLE_XET=1
forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct
```

2. **Check GPU Availability**:
```bash
nvidia-smi # Should show your GPUs
```

### Running the Notebook

#### Option 1: Using Jupyter Notebook

1. **Start Jupyter**:
```bash
cd /home/hosseinkh/TorchForge/forge
jupyter notebook
```

2. **Open the notebook**:
   - Navigate to `apps/sft_v2/interactive_config_notebook.ipynb`
   - Click to open

3. **Run cells sequentially**:
   - Click on the first cell, press `Shift + Enter` to run
   - Continue through all cells
   - Modify configuration cells as needed
   - Run Step 12 to start training

#### Option 2: Using VS Code

1. **Open notebook in VS Code**:
   - File → Open → `interactive_config_notebook.ipynb`

2. **Select Python kernel**:
   - Click "Select Kernel" in the top right
   - Choose your Python environment

3. **Run cells**:
   - Click the "Run Cell" button on each cell
   - Or press `Shift + Enter`

#### Option 3: Using Command Line (with simplified entry point)

```bash
cd /home/hosseinkh/TorchForge/forge
python -m apps.sft_v2.notebook_main --config apps/sft_v2/llama3_8b.yaml
```

Note: this entry point reads a YAML file; use the notebook if you want fully interactive configuration.

---

## Common Scenarios

### Scenario 1: Quick Test (1 GPU, 100 steps)

```python
# Modify these cells:
processes_config = {"procs": 1, "with_gpus": True}
training_config = {
    "local_batch_size": 1,
    "seq_len": 1024,
    "steps": 100,  # Just 100 steps
    ...
}
```

**Expected time**: 5-10 minutes on an A100

### Scenario 2: Full Training (8 GPUs, 5000 steps)

```python
processes_config = {"procs": 8, "with_gpus": True}
training_config = {
    "local_batch_size": 2,
    "seq_len": 2048,
    "steps": 5000,
    ...
}
parallelism_config = {
    "data_parallel_shard_degree": -1,  # Use all 8 GPUs
    ...
}
```

**Expected time**: Several hours depending on hardware

### Scenario 3: Memory-Constrained Training

```python
training_config = {
    "local_batch_size": 1,  # Small batch
    "seq_len": 1024,  # Shorter sequence
    ...
}
activation_checkpoint_config = {
    "mode": "selective",  # Enable AC for memory savings
    ...
}
```

**Use when**: Running out of GPU memory

### Scenario 4: Resume from Checkpoint

```python
checkpoint_config = {
    "enable": True,
    "folder": "/path/to/previous/checkpoints",
    "initial_load_path": "/path/to/previous/checkpoints/step_1000",
    "interval": 500,
    ...
}
```

**Use when**: Continuing training from a saved checkpoint

---

## Troubleshooting

### Problem: "CUDA out of memory"

**Solutions**:
1. Reduce `seq_len` (e.g., from 2048 to 1024)
2. Reduce `local_batch_size` (e.g., from 2 to 1)
3. Enable activation checkpointing
4. Use more GPUs with FSDP

### Problem: "Loss is NaN or exploding"

**Solutions**:
1. Reduce the learning rate (e.g., from `1e-5` to `1e-6`)
2. Tighten gradient clipping by lowering `max_norm` (e.g., from 1.0 to 0.5)
3. Increase warmup steps

### Problem: "Training is too slow"

**Solutions**:
1. Increase `local_batch_size` if memory allows
2. Use more GPUs
3. Reduce `seq_len` if your task doesn't need long context
4. Enable compilation (`compile: True`)

### Problem: "Cannot find tokenizer files"

**Solutions**:
1. Check that `hf_assets_path` is correct
2. Ensure the path contains `tokenizer.json` and `tokenizer_config.json`
3. 
Re-download model if files are missing + +### Problem: "Actor spawning fails" + +**Solutions**: +1. Check you have enough GPUs for `procs` +2. Verify CUDA is available (`torch.cuda.is_available()`) +3. Check no other processes are using GPUs + +--- + +## Summary + +**Key Takeaways**: + +1. **Interactive Configuration**: Define all settings in notebook cells, no YAML needed +2. **Step-by-Step**: Configure model, processes, optimizer, training, parallelism, checkpoints separately +3. **Two Ways to Run**: Simple (`run_actor()`) or manual (lifecycle control) +4. **Utility Functions**: Helper functions for tokenization, data loading, device management +5. **Templates Provided**: Quick test, multi-GPU, memory-efficient configs ready to use +6. **Flexible**: Easy to modify parameters and experiment + +**Next Steps**: +1. Download your model +2. Open the notebook +3. Modify configuration cells for your needs +4. Run Step 12 to start training +5. Monitor logs for progress + +Happy Training! ๐Ÿš€ diff --git a/apps/sft_v2/README_NOTEBOOK.md b/apps/sft_v2/README_NOTEBOOK.md deleted file mode 100644 index eb70a29ea..000000000 --- a/apps/sft_v2/README_NOTEBOOK.md +++ /dev/null @@ -1,435 +0,0 @@ -# ๐Ÿš€ SFT Training Notebook Guide - -This directory contains an interactive Jupyter notebook experience for training Language Models with Supervised Fine-Tuning (SFT). - -## ๐Ÿ“ Files - -### Core Files -- **`sft_training_notebook.ipynb`** - Main Jupyter notebook for interactive training -- **`notebook_utils.py`** - Utility functions for notebook-based training -- **`main.py`** - Original command-line training script (unchanged) - -### Configuration Files -- **`llama3_8b.yaml`** - Original single-node config -- **`llama3_8b_single_node.yaml`** - Single-node config without provisioner -- **`llama3_8b_slurm_multinode.yaml`** - Multi-node config with SLURM -- **`llama3_8b_local.yaml`** - Local testing config - -## ๐ŸŽฏ Quick Start - -### 1. Open the Notebook - -```bash -cd /home/hosseinkh/forge -jupyter notebook apps/sft_v2/sft_training_notebook.ipynb -``` - -Or in VS Code: -- Open `apps/sft_v2/sft_training_notebook.ipynb` -- Select Python kernel -- Run cells sequentially - -### 2. Configure Training - -The notebook is organized into sections: - -1. **๐Ÿ“ฆ Model Configuration** - Choose model and path -2. **โš™๏ธ Training Configuration** - Set hyperparameters -3. **๐Ÿ”ง Optimizer Configuration** - Configure optimizer and LR scheduler -4. **๐Ÿ”€ Parallelism Configuration** - Set distributed training strategy -5. **๐Ÿ’พ Checkpoint Configuration** - Configure checkpointing -6. **๐Ÿ–ฅ๏ธ Resource Configuration** - Set number of GPUs/nodes -7. **โ˜๏ธ Provisioner Configuration** (optional) - For multi-node SLURM - -### 3. Run Training - -Execute the "Run Training!" cell to start training with your configuration. - -## ๐Ÿ“š Using the Utility Library - -The `notebook_utils.py` module provides a clean API for training: - -### Configuration Builders - -```python -from apps.sft_v2 import notebook_utils as nb - -# Create model config -model_config = nb.create_model_config( - name="llama3", - flavor="8B", - hf_assets_path="/path/to/model" -) - -# Create training config -training_config = nb.create_training_config( - steps=1000, - local_batch_size=1, - seq_len=2048 -) - -# Create optimizer config -optimizer_config = nb.create_optimizer_config( - name="AdamW", - lr=1e-5 -) - -# ... 
configure other components - -# Build complete config -config = nb.build_config( - model_config=model_config, - training_config=training_config, - optimizer_config=optimizer_config, - # ... other configs -) -``` - -### Training Functions - -```python -# Simple: run everything -nb.train(config) - -# Advanced: step-by-step control -import asyncio - -async def custom_training(): - # Initialize - await nb.initialize_provisioner(config) - - # Create and setup - recipe = await nb.create_recipe(config) - await nb.setup_recipe(recipe) - - # Train - await nb.train_recipe(recipe) - - # Cleanup - await nb.cleanup_recipe(recipe) - await nb.shutdown_provisioner(config) - -asyncio.run(custom_training()) -``` - -### Display Utilities - -```python -# Print summary -nb.summarize_config(config) - -# Print full YAML -nb.print_config(config, title="My Config") -``` - -## ๐Ÿ”ง Configuration Functions Reference - -### Model Configuration - -```python -nb.create_model_config( - name: str = "llama3", - flavor: str = "8B", - hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct" -) -``` - -### Training Configuration - -```python -nb.create_training_config( - local_batch_size: int = 1, - seq_len: int = 2048, - max_norm: float = 1.0, - steps: int = 1000, - dataset: str = "c4", - compile: bool = False -) -``` - -### Optimizer Configuration - -```python -nb.create_optimizer_config( - name: str = "AdamW", - lr: float = 1e-5, - eps: float = 1e-8, - weight_decay: float = 0.0, - betas: tuple = (0.9, 0.999) -) -``` - -### LR Scheduler Configuration - -```python -nb.create_lr_scheduler_config( - warmup_steps: int = 200, - decay_steps: Optional[int] = None, - min_lr: float = 0.0 -) -``` - -### Parallelism Configuration - -```python -nb.create_parallelism_config( - data_parallel_replicate_degree: int = 1, - data_parallel_shard_degree: int = -1, # -1 = auto (FSDP) - tensor_parallel_degree: int = 1, - pipeline_parallel_degree: int = 1, - context_parallel_degree: int = 1, - expert_parallel_degree: int = 1, - disable_loss_parallel: bool = False -) -``` - -### Checkpoint Configuration - -```python -nb.create_checkpoint_config( - enable: bool = True, - folder: str = "/tmp/checkpoints", - initial_load_path: Optional[str] = None, - initial_load_in_hf: bool = True, - last_save_in_hf: bool = True, - interval: int = 500, - async_mode: str = "disabled" -) -``` - -### Activation Checkpoint Configuration - -```python -nb.create_activation_checkpoint_config( - mode: str = "selective", # 'selective', 'full', 'none' - selective_ac_option: str = "op" -) -``` - -### Process Configuration - -```python -# Single node -nb.create_process_config( - procs: int = 8, - with_gpus: bool = True, - hosts: Optional[int] = None -) - -# Multi-node -nb.create_process_config( - procs: int = 8, - with_gpus: bool = True, - hosts: int = 4 # 4 nodes -) -``` - -### Provisioner Configuration (Multi-Node Only) - -```python -nb.create_provisioner_config( - launcher: str = "slurm", - job_name: str = "sft_training", - partition: Optional[str] = None, - time: Optional[str] = None, - account: Optional[str] = None -) -``` - -## ๐Ÿ“– Example Configurations - -### Quick Test (Single GPU, 10 steps) - -```python -model_config = nb.create_model_config( - name="llama3", - flavor="8B", - hf_assets_path="/path/to/model" -) - -training_config = nb.create_training_config( - steps=10, - local_batch_size=1 -) - -process_config = nb.create_process_config(procs=1) - -# ... 
configure other components with defaults -``` - -### Single Node, 8 GPUs, FSDP - -```python -parallelism_config = nb.create_parallelism_config( - data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP -) - -process_config = nb.create_process_config(procs=8) - -# No provisioner needed -provisioner_config = None -``` - -### Multi-Node, 4ร—8 GPUs, Tensor Parallel - -```python -parallelism_config = nb.create_parallelism_config( - data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP - tensor_parallel_degree=2 -) - -process_config = nb.create_process_config( - procs=8, - hosts=4 -) - -provisioner_config = nb.create_provisioner_config( - launcher="slurm", - job_name="sft_multinode", - partition="gpu_partition", - time="24:00:00" -) -``` - -## ๐ŸŽ“ Advanced Usage - -### Custom Training Loop - -You can modify the training loop by creating your own recipe class: - -```python -from apps.sft_v2.main import ForgeSFTRecipe - -class CustomRecipe(ForgeSFTRecipe): - async def train(self): - # Custom training logic - dataloader = iter(self.train_dataloader) - - for step in range(self.num_training_steps): - batch = next(dataloader) - # Custom batch processing - self.train_step(batch) -``` - -### Experiment Tracking - -Integrate with your favorite tracking tool: - -```python -import wandb - -# Initialize tracking -wandb.init(project="sft-training", config=config) - -# Train -nb.train(config) - -# Log results -wandb.log({"final_step": config.training.steps}) -``` - -### Config Variations - -Generate multiple configs for hyperparameter sweeps: - -```python -learning_rates = [1e-5, 5e-5, 1e-4] -configs = [] - -for lr in learning_rates: - optimizer_config = nb.create_optimizer_config(lr=lr) - config = nb.build_config( - # ... other configs - optimizer_config=optimizer_config - ) - configs.append(config) - -# Train all configs -for config in configs: - nb.train(config) -``` - -## ๐Ÿ” Debugging Tips - -### Start Simple - -1. **Use 1 GPU first**: - ```python - process_config = nb.create_process_config(procs=1) - ``` - -2. **Run few steps**: - ```python - training_config = nb.create_training_config(steps=10) - ``` - -3. **Disable compilation**: - ```python - training_config = nb.create_training_config(compile=False) - ``` - -### Common Issues - -**Memory Errors:** -- Reduce batch size or sequence length -- Enable FSDP: `data_parallel_shard_degree=-1` -- Enable activation checkpointing: `mode="selective"` or `"full"` - -**Slow Training:** -- Increase batch size if memory allows -- Enable compilation: `compile=True` -- Use tensor parallelism for large models - -**Actor Timeout Errors:** -- Make sure you're not using provisioner config on single node -- Check SLURM availability with `sinfo` -- See `TROUBLESHOOTING_MULTINODE.md` for details - -## ๐Ÿ“ฆ Saving and Loading Configs - -### Save Config - -```python -from omegaconf import OmegaConf - -config_path = "my_config.yaml" -with open(config_path, 'w') as f: - OmegaConf.save(config, f) -``` - -### Load Config - -```python -from omegaconf import OmegaConf - -config = OmegaConf.load("my_config.yaml") -nb.train(config) -``` - -## ๐Ÿš€ Next Steps - -1. **Start with the notebook**: Open `sft_training_notebook.ipynb` and follow along -2. **Try a test run**: Configure for 10 steps with 1 GPU -3. **Scale up**: Increase to 8 GPUs with FSDP -4. 
**Go multi-node**: Configure SLURM provisioner for cluster training - -## ๐Ÿ“š Additional Resources - -- **`MULTINODE_SFT_V2_GUIDE.md`** - Detailed guide on multi-node training -- **`TROUBLESHOOTING_MULTINODE.md`** - Troubleshooting guide for multi-node issues -- **`main.py`** - Original implementation for reference - -## ๐Ÿค Contributing - -To add new configuration options: - -1. Add a `create_*_config()` function in `notebook_utils.py` -2. Update `build_config()` to include the new config -3. Add a new cell in the notebook to configure it -4. Update this README - -## โš–๏ธ License - -Copyright (c) Meta Platforms, Inc. and affiliates. - -Licensed under the BSD-style license found in the LICENSE file. diff --git a/apps/sft_v2/actor.py b/apps/sft_v2/actor.py new file mode 100644 index 000000000..8607a39c4 --- /dev/null +++ b/apps/sft_v2/actor.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Abstract Actor class for training/inference actors in Forge. + +This provides a base class that can be extended for different types of actors +(e.g., Trainer, Evaluator, Inferencer, etc.) +""" + +import logging +import math +import os +from abc import ABC, abstractmethod +from typing import Any, Optional + +import torch +from forge.controller import ForgeActor +from monarch.actor import current_rank, current_size +from omegaconf import DictConfig, OmegaConf +from torch import nn +from torchtitan.components.loss import LossFunction +from torchtitan.components.lr_scheduler import LRSchedulersContainer +from torchtitan.components.optimizer import OptimizersContainer +from torchtitan.distributed import ParallelDims +from torchtitan.experiments.forge.engine import ForgeEngine +from torchtitan.experiments.forge.job_config import ForgeJobConfig + +Checkpointer = Any +Dataloader = Any +MetricLogger = Any +Profiler = Any +Tokenizer = Any + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class BaseForgeActor(ForgeActor, ForgeEngine, ABC): + """ + Abstract base class for Forge actors. + + This class handles common initialization, distributed setup, and provides + abstract methods that must be implemented by concrete actor classes. + """ + + job_config: ForgeJobConfig + parallel_dims: ParallelDims + model: list[nn.Module] + loss_fn: Optional[LossFunction] + optimizer: Optional[OptimizersContainer] + lr_scheduler: Optional[LRSchedulersContainer] + checkpointer: Optional[Checkpointer] + tokenizer: Optional[Tokenizer] + metric_logger: Optional[MetricLogger] + profiler: Optional[Profiler] + device: torch.device + + def __init__(self, config: DictConfig): + """ + Initialize the base actor with configuration. + + Args: + config: Configuration dictionary containing job settings + """ + job_config = ForgeJobConfig().to_dict() + job_config = OmegaConf.merge(job_config, config) + + self.current_step = 0 + self.metric_logger = None + self.gradient_accumulation_steps = 1 + self._rank = current_rank().rank + self._size = math.prod(current_size().values()) + + self._init_dist() + super().__init__(job_config) + + def _init_dist(self): + """ + Initialize torch distributed environment. + + Sets up environment variables required for distributed training + in the Monarch actor framework. 
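+
+        These mirror the standard torch.distributed / torchrun environment
+        variables (RANK, WORLD_SIZE, LOCAL_RANK, ...), derived from the
+        Monarch rank and size of this actor and exported before the engine
+        superclass is initialized. Illustrative values for rank 3 of an
+        8-process, single-host mesh: RANK=3, LOCAL_RANK=3, WORLD_SIZE=8,
+        LOCAL_WORLD_SIZE=8.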
+ """ + env = { + "RANK": str(self._rank), + "LOCAL_RANK": str(self._rank), + "LOCAL_WORLD_SIZE": str(self._size), + "GROUP_RANK": str(self._size), + "GROUP_WORLD_SIZE": str(self._size), + "ROLE_RANK": str(self._rank), + "ROLE_WORLD_SIZE": str(self._size), + "ROLE_NAME": "rank", + "WORLD_SIZE": str(self._size), + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + } + os.environ.update(env) + logger.info(f"Initialized distributed environment: {env}") + + @abstractmethod + async def setup(self): + """ + Setup the actor (load data, checkpoint, etc.). + + This method must be implemented by concrete actor classes. + """ + pass + + @abstractmethod + async def run(self): + """ + Main execution logic for the actor. + + This method must be implemented by concrete actor classes. + """ + pass + + @abstractmethod + async def cleanup(self): + """ + Cleanup resources (close checkpointer, logger, etc.). + + This method must be implemented by concrete actor classes. + """ + pass + + @abstractmethod + def __repr__(self) -> str: + """String representation of the actor.""" + pass diff --git a/apps/sft_v2/interactive_config_notebook.ipynb b/apps/sft_v2/interactive_config_notebook.ipynb new file mode 100644 index 000000000..624f6a08a --- /dev/null +++ b/apps/sft_v2/interactive_config_notebook.ipynb @@ -0,0 +1,629 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SFT Training - Interactive Configuration Notebook\n", + "\n", + "This notebook allows you to configure and run SFT training **without any YAML files**!\n", + "\n", + "## Benefits\n", + "\n", + "โœ… No external YAML files needed \n", + "โœ… Interactive configuration in separate cells \n", + "โœ… Easy to modify and experiment \n", + "โœ… All configuration visible in notebook \n", + "โœ… Quick templates for common scenarios" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Import Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import logging\n", + "from omegaconf import OmegaConf, DictConfig\n", + "\n", + "from forge.apps.sft_v2.trainer_actor import TrainerActor\n", + "from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor\n", + "\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Configure Model Settings\n", + "\n", + "Define your model configuration. **Modify these values as needed!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_config = {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + "}\n", + "\n", + "print(\"Model Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(model_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Configure Process Settings\n", + "\n", + "Define how many processes to use and whether to use GPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processes_config = {\n", + " \"procs\": 8, # Number of processes\n", + " \"with_gpus\": True # Use GPUs\n", + "}\n", + "\n", + "print(\"Process Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(processes_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Configure Optimizer Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer_config = {\n", + " \"name\": \"AdamW\",\n", + " \"lr\": 1e-5, # Learning rate\n", + " \"eps\": 1e-8\n", + "}\n", + "\n", + "print(\"Optimizer Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(optimizer_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Configure Learning Rate Scheduler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lr_scheduler_config = {\n", + " \"warmup_steps\": 200 # Number of warmup steps\n", + "}\n", + "\n", + "print(\"LR Scheduler Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(lr_scheduler_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Configure Training Settings\n", + "\n", + "**Key parameters to adjust for your experiment:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_config = {\n", + " \"local_batch_size\": 1, # Batch size per GPU\n", + " \"seq_len\": 2048, # Sequence length\n", + " \"max_norm\": 1.0, # Gradient clipping\n", + " \"steps\": 1000, # Total training steps\n", + " \"compile\": False, # PyTorch compilation\n", + " \"dataset\": \"c4\" # Dataset name\n", + "}\n", + "\n", + "print(\"Training Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(training_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7: Configure Parallelism Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "parallelism_config = {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": -1, # -1 means use all available GPUs for FSDP\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + "}\n", + "\n", + "print(\"Parallelism Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(parallelism_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 8: Configure Checkpoint Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint_config = {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 500, # Save every N steps\n", + " \"async_mode\": \"disabled\"\n", + "}\n", + "\n", + "print(\"Checkpoint Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(checkpoint_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 9: Configure Activation Checkpointing" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "activation_checkpoint_config = {\n", + " \"mode\": \"selective\",\n", + " \"selective_ac_option\": \"op\"\n", + "}\n", + "\n", + "print(\"Activation Checkpoint Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(activation_checkpoint_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 10: Configure Communication Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comm_config = {\n", + " \"trace_buf_size\": 0\n", + "}\n", + "\n", + "print(\"Communication Configuration:\")\n", + "print(OmegaConf.to_yaml(OmegaConf.create(comm_config)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 11: Combine All Configurations\n", + "\n", + "Now let's merge everything into a complete configuration!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Combine all configs\n", + "complete_config = {\n", + " \"comm\": comm_config,\n", + " \"model\": model_config,\n", + " \"processes\": processes_config,\n", + " \"optimizer\": optimizer_config,\n", + " \"lr_scheduler\": lr_scheduler_config,\n", + " \"training\": training_config,\n", + " \"parallelism\": parallelism_config,\n", + " \"checkpoint\": checkpoint_config,\n", + " \"activation_checkpoint\": activation_checkpoint_config\n", + "}\n", + "\n", + "# Create OmegaConf DictConfig\n", + "cfg = OmegaConf.create(complete_config)\n", + "\n", + "print(\"=\" * 80)\n", + "print(\"COMPLETE CONFIGURATION\")\n", + "print(\"=\" * 80)\n", + "print(OmegaConf.to_yaml(cfg))\n", + "print(\"=\" * 80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 12: Run Training (Simple Way)\n", + "\n", + "The simplest way - automatic lifecycle management!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training with automatic lifecycle management\n", + "await run_actor(TrainerActor, cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Alternative: Manual Lifecycle Control\n", + "\n", + "For more control, manage each phase separately.\n", + "\n", + "### Create and Spawn the Actor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the spawner\n", + "spawner = SpawnActor(TrainerActor, cfg)\n", + "\n", + "# Spawn the actor\n", + "actor = await spawner.spawn()\n", + "print(f\"โœ“ Actor spawned: {actor}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup the Actor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup (load data, checkpoints, etc.)\n", + "await spawner.setup()\n", + "print(\"โœ“ Actor setup complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run training\n", + "await spawner.run()\n", + "print(\"โœ“ Training complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Cleanup resources\n", + "await spawner.cleanup()\n", + "print(\"โœ“ Cleanup complete\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Quick Configuration Templates\n", + "\n", + "Here are ready-to-use templates for common scenarios!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Template 1: Quick Test (Single GPU, Small Steps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quick_test_config = OmegaConf.create({\n", + " \"comm\": {\"trace_buf_size\": 0},\n", + " \"model\": {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + " },\n", + " \"processes\": {\"procs\": 1, \"with_gpus\": True},\n", + " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", + " \"lr_scheduler\": {\"warmup_steps\": 10},\n", + " \"training\": {\n", + " \"local_batch_size\": 1,\n", + " \"seq_len\": 1024,\n", + " \"max_norm\": 1.0,\n", + " \"steps\": 100, # Just 100 steps for quick testing\n", + " \"compile\": False,\n", + " \"dataset\": \"c4\"\n", + " },\n", + " \"parallelism\": {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": 1,\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + " },\n", + " \"checkpoint\": {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/quick_test_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 50,\n", + " \"async_mode\": \"disabled\"\n", + " },\n", + " \"activation_checkpoint\": {\n", + " \"mode\": \"selective\",\n", + " \"selective_ac_option\": \"op\"\n", + " }\n", + "})\n", + "\n", + "print(\"Quick Test Configuration:\")\n", + "print(OmegaConf.to_yaml(quick_test_config))\n", + "\n", + "# To use: await run_actor(TrainerActor, quick_test_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Template 2: Multi-GPU Training (8 GPUs with FSDP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "multi_gpu_config = OmegaConf.create({\n", + " \"comm\": {\"trace_buf_size\": 0},\n", + " \"model\": {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + " },\n", + " \"processes\": {\"procs\": 8, \"with_gpus\": True},\n", + " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 2e-5, \"eps\": 1e-8},\n", + " \"lr_scheduler\": {\"warmup_steps\": 200},\n", + " \"training\": {\n", + " \"local_batch_size\": 2,\n", + " \"seq_len\": 2048,\n", + " \"max_norm\": 1.0,\n", + " \"steps\": 5000,\n", + " \"compile\": False,\n", + " \"dataset\": \"c4\"\n", + " },\n", + " \"parallelism\": {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": 8, # FSDP across 8 GPUs\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + " },\n", + " \"checkpoint\": {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/multi_gpu_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 500,\n", + " \"async_mode\": \"disabled\"\n", + " },\n", + " \"activation_checkpoint\": {\n", + " \"mode\": \"selective\",\n", + " \"selective_ac_option\": \"op\"\n", + " }\n", + "})\n", + "\n", + 
"print(\"Multi-GPU Configuration:\")\n", + "print(OmegaConf.to_yaml(multi_gpu_config))\n", + "\n", + "# To use: await run_actor(TrainerActor, multi_gpu_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Template 3: Memory-Efficient Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "memory_efficient_config = OmegaConf.create({\n", + " \"comm\": {\"trace_buf_size\": 0},\n", + " \"model\": {\n", + " \"name\": \"llama3\",\n", + " \"flavor\": \"8B\",\n", + " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", + " },\n", + " \"processes\": {\"procs\": 4, \"with_gpus\": True},\n", + " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", + " \"lr_scheduler\": {\"warmup_steps\": 150},\n", + " \"training\": {\n", + " \"local_batch_size\": 1, # Small batch size\n", + " \"seq_len\": 1024, # Shorter sequence\n", + " \"max_norm\": 1.0,\n", + " \"steps\": 2000,\n", + " \"compile\": False,\n", + " \"dataset\": \"c4\"\n", + " },\n", + " \"parallelism\": {\n", + " \"data_parallel_replicate_degree\": 1,\n", + " \"data_parallel_shard_degree\": 4,\n", + " \"tensor_parallel_degree\": 1,\n", + " \"pipeline_parallel_degree\": 1,\n", + " \"context_parallel_degree\": 1,\n", + " \"expert_parallel_degree\": 1,\n", + " \"disable_loss_parallel\": False\n", + " },\n", + " \"checkpoint\": {\n", + " \"enable\": True,\n", + " \"folder\": \"/tmp/memory_efficient_checkpoints\",\n", + " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", + " \"initial_load_in_hf\": True,\n", + " \"last_save_in_hf\": True,\n", + " \"interval\": 400,\n", + " \"async_mode\": \"disabled\"\n", + " },\n", + " \"activation_checkpoint\": {\n", + " \"mode\": \"selective\", # Saves memory\n", + " \"selective_ac_option\": \"op\"\n", + " }\n", + "})\n", + "\n", + "print(\"Memory-Efficient Configuration:\")\n", + "print(OmegaConf.to_yaml(memory_efficient_config))\n", + "\n", + "# To use: await run_actor(TrainerActor, memory_efficient_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Tips & Tricks\n", + "\n", + "## Memory Optimization\n", + "- โฌ‡๏ธ Reduce `seq_len` if running out of memory\n", + "- โฌ‡๏ธ Reduce `local_batch_size` if running out of memory\n", + "- โœ… Enable `activation_checkpoint` for memory savings\n", + "\n", + "## Training Speed\n", + "- โฌ†๏ธ Increase `local_batch_size` for faster training (if memory allows)\n", + "- ๐Ÿš€ Use multiple GPUs with FSDP (`data_parallel_shard_degree > 1`)\n", + "- โšก Enable `compile: true` for PyTorch compilation (experimental)\n", + "\n", + "## Debugging\n", + "- ๐Ÿงช Start with small `steps` (e.g., 10-100) to test quickly\n", + "- ๐Ÿ” Use single GPU first (`procs: 1`)\n", + "- ๐Ÿ“Š Monitor loss values in logs\n", + "\n", + "## Checkpoint Management\n", + "- ๐Ÿ’พ Set `interval` based on how often you want to save\n", + "- ๐Ÿ“ Ensure `folder` path exists and has enough space\n", + "- ๐Ÿ”„ Use `initial_load_path` to resume from checkpoints" + ] + } + ], + "metadata": { + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/apps/sft_v2/notebook_utils.py b/apps/sft_v2/notebook_utils.py deleted file mode 100644 index b3636fd26..000000000 --- a/apps/sft_v2/notebook_utils.py +++ /dev/null @@ -1,463 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Utility functions for notebook-based SFT training. -This module provides a clean API for interactive training in Jupyter notebooks. -""" - -import asyncio -import logging -from typing import Any, Dict, Optional - -import torch - -from apps.sft_v2.main import ForgeSFTRecipe -from omegaconf import DictConfig, OmegaConf - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# ============================================================================ -# Configuration Builders -# ============================================================================ - - -def create_model_config( - name: str = "llama3", - flavor: str = "8B", - hf_assets_path: str = "/tmp/Meta-Llama-3.1-8B-Instruct", -) -> Dict[str, Any]: - """ - Create model configuration. - - Args: - name: Model architecture name (e.g., 'llama3', 'llama2') - flavor: Model size (e.g., '8B', '70B') - hf_assets_path: Path to HuggingFace model assets - - Returns: - Dictionary with model configuration - """ - return { - "name": name, - "flavor": flavor, - "hf_assets_path": hf_assets_path, - } - - -def create_optimizer_config( - name: str = "AdamW", - lr: float = 1e-5, - eps: float = 1e-8, - weight_decay: float = 0.0, - betas: tuple = (0.9, 0.999), -) -> Dict[str, Any]: - """ - Create optimizer configuration. - - Args: - name: Optimizer name (e.g., 'AdamW', 'Adam', 'SGD') - lr: Learning rate - eps: Epsilon for numerical stability - weight_decay: L2 regularization coefficient - betas: Coefficients for computing running averages - - Returns: - Dictionary with optimizer configuration - """ - return { - "name": name, - "lr": lr, - "eps": eps, - "weight_decay": weight_decay, - "betas": list(betas), - } - - -def create_lr_scheduler_config( - warmup_steps: int = 200, - decay_steps: Optional[int] = None, - min_lr: float = 0.0, -) -> Dict[str, Any]: - """ - Create learning rate scheduler configuration. - - Args: - warmup_steps: Number of warmup steps - decay_steps: Number of decay steps (None = no decay) - min_lr: Minimum learning rate - - Returns: - Dictionary with LR scheduler configuration - """ - config = {"warmup_steps": warmup_steps} - if decay_steps is not None: - config["decay_steps"] = decay_steps - if min_lr > 0: - config["min_lr"] = min_lr - return config - - -def create_training_config( - local_batch_size: int = 1, - seq_len: int = 2048, - max_norm: float = 1.0, - steps: int = 1000, - dataset: str = "c4", - compile: bool = False, -) -> Dict[str, Any]: - """ - Create training configuration. - - Args: - local_batch_size: Batch size per GPU - seq_len: Sequence length - max_norm: Gradient clipping max norm - steps: Total training steps - dataset: Dataset name - compile: Whether to use torch.compile - - Returns: - Dictionary with training configuration - """ - return { - "local_batch_size": local_batch_size, - "seq_len": seq_len, - "max_norm": max_norm, - "steps": steps, - "dataset": dataset, - "compile": compile, - } - - -def create_parallelism_config( - data_parallel_replicate_degree: int = 1, - data_parallel_shard_degree: int = -1, - tensor_parallel_degree: int = 1, - pipeline_parallel_degree: int = 1, - context_parallel_degree: int = 1, - expert_parallel_degree: int = 1, - disable_loss_parallel: bool = False, -) -> Dict[str, Any]: - """ - Create parallelism configuration. 
- - Args: - data_parallel_replicate_degree: Data parallel replication - data_parallel_shard_degree: Data parallel sharding (FSDP), -1 = auto - tensor_parallel_degree: Tensor parallelism degree - pipeline_parallel_degree: Pipeline parallelism degree - context_parallel_degree: Context parallelism degree - expert_parallel_degree: Expert parallelism degree (for MoE) - disable_loss_parallel: Whether to disable loss parallelism - - Returns: - Dictionary with parallelism configuration - """ - return { - "data_parallel_replicate_degree": data_parallel_replicate_degree, - "data_parallel_shard_degree": data_parallel_shard_degree, - "tensor_parallel_degree": tensor_parallel_degree, - "pipeline_parallel_degree": pipeline_parallel_degree, - "context_parallel_degree": context_parallel_degree, - "expert_parallel_degree": expert_parallel_degree, - "disable_loss_parallel": disable_loss_parallel, - } - - -def create_checkpoint_config( - enable: bool = True, - folder: str = "/tmp/checkpoints", - initial_load_path: Optional[str] = None, - initial_load_in_hf: bool = True, - last_save_in_hf: bool = True, - interval: int = 500, - async_mode: str = "disabled", -) -> Dict[str, Any]: - """ - Create checkpoint configuration. - - Args: - enable: Whether to enable checkpointing - folder: Path to save checkpoints - initial_load_path: Path to load initial checkpoint from - initial_load_in_hf: Load initial checkpoint in HF format - last_save_in_hf: Save last checkpoint in HF format - interval: Steps between checkpoints - async_mode: Async checkpoint mode ('disabled', 'async', etc.) - - Returns: - Dictionary with checkpoint configuration - """ - return { - "enable": enable, - "folder": folder, - "initial_load_path": initial_load_path, - "initial_load_in_hf": initial_load_in_hf, - "last_save_in_hf": last_save_in_hf, - "interval": interval, - "async_mode": async_mode, - } - - -def create_activation_checkpoint_config( - mode: str = "selective", - selective_ac_option: str = "op", -) -> Dict[str, Any]: - """ - Create activation checkpointing configuration. - - Args: - mode: Activation checkpoint mode ('selective', 'full', 'none') - selective_ac_option: Selective AC option ('op', 'layer', etc.) - - Returns: - Dictionary with activation checkpoint configuration - """ - return { - "mode": mode, - "selective_ac_option": selective_ac_option, - } - - -def create_process_config( - procs: int = 8, - with_gpus: bool = True, - hosts: Optional[int] = None, -) -> Dict[str, Any]: - """ - Create process configuration. - - Args: - procs: Number of processes per host - with_gpus: Whether to use GPUs - hosts: Number of hosts (None = single node) - - Returns: - Dictionary with process configuration - """ - config = { - "procs": procs, - "with_gpus": with_gpus, - } - if hosts is not None: - config["hosts"] = hosts - return config - - -# ============================================================================ -# Configuration Assembly -# ============================================================================ - - -def build_config( - model_config: Dict[str, Any], - optimizer_config: Dict[str, Any], - lr_scheduler_config: Dict[str, Any], - training_config: Dict[str, Any], - parallelism_config: Dict[str, Any], - checkpoint_config: Dict[str, Any], - activation_checkpoint_config: Dict[str, Any], - process_config: Dict[str, Any], -) -> DictConfig: - """ - Build complete configuration from component configs. 
- - Args: - model_config: Model configuration - optimizer_config: Optimizer configuration - lr_scheduler_config: LR scheduler configuration - training_config: Training configuration - parallelism_config: Parallelism configuration - checkpoint_config: Checkpoint configuration - activation_checkpoint_config: Activation checkpoint configuration - process_config: Process configuration - - Returns: - Complete OmegaConf DictConfig - """ - config = { - "comm": {"trace_buf_size": 0}, - "model": model_config, - "optimizer": optimizer_config, - "lr_scheduler": lr_scheduler_config, - "training": training_config, - "parallelism": parallelism_config, - "checkpoint": checkpoint_config, - "activation_checkpoint": activation_checkpoint_config, - "processes": process_config, - } - - return OmegaConf.create(config) - - -# ============================================================================ -# Training Functions -# ============================================================================ - - -async def create_recipe(config: DictConfig): - """ - Create and return a ForgeSFTRecipe actor. - - Args: - config: Complete configuration - - Returns: - ForgeSFTRecipe actor instance - """ - process_cfg = config.pop("processes") - recipe = await ForgeSFTRecipe.options(**process_cfg).as_actor(config) - logger.info("Recipe created successfully") - return recipe - - -async def setup_recipe(recipe): - """ - Setup the recipe (load model, initialize data loaders, etc.). - - Args: - recipe: ForgeSFTRecipe actor instance - """ - logger.info("Setting up recipe...") - await recipe.setup.call() - logger.info("Recipe setup complete") - - -async def train_recipe(recipe): - """ - Run training on the recipe. - - Args: - recipe: ForgeSFTRecipe actor instance - """ - logger.info("Starting training...") - await recipe.train.call() - logger.info("Training complete") - - -async def cleanup_recipe(recipe): - """ - Cleanup recipe resources. - - Args: - recipe: ForgeSFTRecipe actor instance - """ - logger.info("Cleaning up...") - await recipe.cleanup.call() - await recipe.mesh.stop() - logger.info("Cleanup complete") - - -# ============================================================================ -# High-Level Training API -# ============================================================================ - - -async def run_training(config: DictConfig): - """ - Run complete training pipeline with the given configuration. - - Args: - config: Complete configuration - - Raises: - Exception: If training fails - """ - # Create recipe - recipe = await create_recipe(config) - - # Setup - await setup_recipe(recipe) - - # Train - await train_recipe(recipe) - - # Cleanup - await cleanup_recipe(recipe) - - -def train(config: DictConfig): - """ - Synchronous wrapper for run_training. - - Args: - config: Complete configuration - """ - asyncio.run(run_training(config)) - - -# ============================================================================ -# Display Utilities -# ============================================================================ - - -def print_config(config: DictConfig, title: str = "Configuration"): - """ - Pretty print configuration. - - Args: - config: Configuration to print - title: Title for the output - """ - print(f"\n{'='*60}") - print(f"{title:^60}") - print(f"{'='*60}") - print(OmegaConf.to_yaml(config)) - print(f"{'='*60}\n") - - -def summarize_config(config: DictConfig): - """ - Print a summary of the configuration. 
- - Args: - config: Configuration to summarize - """ - print("\n" + "=" * 60) - print("Configuration Summary".center(60)) - print("=" * 60) - - print(f"\n๐Ÿ“ฆ Model:") - print(f" โ€ข Name: {config.model.name}") - print(f" โ€ข Flavor: {config.model.flavor}") - print(f" โ€ข Path: {config.model.hf_assets_path}") - - print(f"\nโš™๏ธ Training:") - print(f" โ€ข Steps: {config.training.steps}") - print(f" โ€ข Batch Size: {config.training.local_batch_size}") - print(f" โ€ข Sequence Length: {config.training.seq_len}") - print(f" โ€ข Dataset: {config.training.dataset}") - - print(f"\n๐Ÿ”ง Optimizer:") - print(f" โ€ข Name: {config.optimizer.name}") - print(f" โ€ข Learning Rate: {config.optimizer.lr}") - print(f" โ€ข Warmup Steps: {config.lr_scheduler.warmup_steps}") - - print(f"\n๐Ÿ”€ Parallelism:") - print( - f" โ€ข Data Parallel (Replicate): {config.parallelism.data_parallel_replicate_degree}" - ) - print( - f" โ€ข Data Parallel (Shard/FSDP): {config.parallelism.data_parallel_shard_degree}" - ) - print(f" โ€ข Tensor Parallel: {config.parallelism.tensor_parallel_degree}") - print(f" โ€ข Pipeline Parallel: {config.parallelism.pipeline_parallel_degree}") - - print(f"\n๐Ÿ’พ Checkpointing:") - print(f" โ€ข Enabled: {config.checkpoint.enable}") - print(f" โ€ข Folder: {config.checkpoint.folder}") - print(f" โ€ข Interval: {config.checkpoint.interval} steps") - - print(f"\n๐Ÿ–ฅ๏ธ Resources:") - if "hosts" in config.processes: - print(f" โ€ข Hosts: {config.processes.hosts}") - print(f" โ€ข Processes per host: {config.processes.procs}") - print(f" โ€ข GPUs: {config.processes.with_gpus}") - - print("\n" + "=" * 60 + "\n") diff --git a/apps/sft_v2/sft_training_notebook.ipynb b/apps/sft_v2/sft_training_notebook.ipynb deleted file mode 100644 index 204ec15a9..000000000 --- a/apps/sft_v2/sft_training_notebook.ipynb +++ /dev/null @@ -1,568 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ๐Ÿš€ SFT Training Notebook\n", - "\n", - "This notebook provides an interactive interface for training Language Models using Supervised Fine-Tuning (SFT).\n", - "\n", - "## Features\n", - "- โœ… Interactive configuration in separate cells\n", - "- โœ… Support for single-node and multi-node training\n", - "- โœ… Easy hyperparameter tuning\n", - "- โœ… Flexible parallelism strategies\n", - "- โœ… Checkpoint management\n", - "\n", - "## Quick Start\n", - "1. Configure each section (model, training, etc.)\n", - "2. Review the complete configuration\n", - "3. Run training!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“š Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.insert(0, '/home/hosseinkh/forge')\n", - "\n", - "from apps.sft_v2 import notebook_utils as nb\n", - "import torch\n", - "\n", - "print(f\"โœ… Imports successful!\")\n", - "print(f\"๐Ÿ“Š PyTorch version: {torch.__version__}\")\n", - "print(f\"๐ŸŽฎ CUDA available: {torch.cuda.is_available()}\")\n", - "if torch.cuda.is_available():\n", - " print(f\"๐Ÿ”ข Number of GPUs: {torch.cuda.device_count()}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“ฆ Model Configuration\n", - "\n", - "Configure the model you want to train." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Model Configuration\n", - "model_config = nb.create_model_config(\n", - " name=\"llama3\",\n", - " flavor=\"8B\",\n", - " hf_assets_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct\"\n", - ")\n", - "\n", - "print(\"๐Ÿ“ฆ Model Configuration:\")\n", - "for key, value in model_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## โš™๏ธ Training Configuration\n", - "\n", - "Set training hyperparameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Training Configuration\n", - "training_config = nb.create_training_config(\n", - " local_batch_size=1, # Batch size per GPU\n", - " seq_len=2048, # Sequence length\n", - " max_norm=1.0, # Gradient clipping\n", - " steps=1000, # Total training steps\n", - " dataset=\"c4\", # Dataset name\n", - " compile=False # Use torch.compile?\n", - ")\n", - "\n", - "print(\"โš™๏ธ Training Configuration:\")\n", - "for key, value in training_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ”ง Optimizer Configuration\n", - "\n", - "Configure the optimizer and learning rate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Optimizer Configuration\n", - "optimizer_config = nb.create_optimizer_config(\n", - " name=\"AdamW\",\n", - " lr=1e-5, # Learning rate\n", - " eps=1e-8, # Epsilon\n", - " weight_decay=0.0, # Weight decay\n", - " betas=(0.9, 0.999) # Adam betas\n", - ")\n", - "\n", - "# LR Scheduler Configuration\n", - "lr_scheduler_config = nb.create_lr_scheduler_config(\n", - " warmup_steps=200, # Warmup steps\n", - " decay_steps=None, # Decay steps (None = no decay)\n", - " min_lr=0.0 # Minimum LR\n", - ")\n", - "\n", - "print(\"๐Ÿ”ง Optimizer Configuration:\")\n", - "for key, value in optimizer_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "\n", - "print(\"\\n๐Ÿ“ˆ LR Scheduler Configuration:\")\n", - "for key, value in lr_scheduler_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ”€ Parallelism Configuration\n", - "\n", - "Configure distributed training strategies.\n", - "\n", - "### Parallelism Options:\n", - "- **Data Parallel (Replicate)**: Basic data parallelism\n", - "- **Data Parallel (Shard/FSDP)**: Fully Sharded Data Parallel (-1 = use all GPUs)\n", - "- **Tensor Parallel**: Split model across multiple GPUs\n", - "- **Pipeline Parallel**: Split model stages across GPUs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Parallelism Configuration\n", - "parallelism_config = nb.create_parallelism_config(\n", - " data_parallel_replicate_degree=1, # DP replicate\n", - " data_parallel_shard_degree=-1, # FSDP (-1 = auto, uses all GPUs)\n", - " tensor_parallel_degree=1, # TP\n", - " pipeline_parallel_degree=1, # PP\n", - " context_parallel_degree=1, # CP\n", - " expert_parallel_degree=1, # EP (for MoE)\n", - " disable_loss_parallel=False\n", - ")\n", - "\n", - "print(\"๐Ÿ”€ Parallelism Configuration:\")\n", - "for key, value in parallelism_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "## ๐Ÿ’พ Checkpoint Configuration\n", - "\n", - "Configure model checkpointing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Checkpoint Configuration\n", - "checkpoint_config = nb.create_checkpoint_config(\n", - " enable=True,\n", - " folder=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", - " initial_load_path=\"/mnt/home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/\",\n", - " initial_load_in_hf=True,\n", - " last_save_in_hf=True,\n", - " interval=500, # Save every N steps\n", - " async_mode=\"disabled\"\n", - ")\n", - "\n", - "# Activation Checkpoint Configuration (for memory efficiency)\n", - "activation_checkpoint_config = nb.create_activation_checkpoint_config(\n", - " mode=\"selective\", # 'selective', 'full', or 'none'\n", - " selective_ac_option=\"op\" # 'op' or 'layer'\n", - ")\n", - "\n", - "print(\"๐Ÿ’พ Checkpoint Configuration:\")\n", - "for key, value in checkpoint_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "\n", - "print(\"\\n๐Ÿ”„ Activation Checkpoint Configuration:\")\n", - "for key, value in activation_checkpoint_config.items():\n", - " print(f\" โ€ข {key}: {value}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ–ฅ๏ธ Resource Configuration\n", - "\n", - "Configure compute resources.\n", - "\n", - "### Options:\n", - "- **Single Node**: Set only `procs` (number of GPUs)\n", - "- **Multi Node**: Set both `hosts` (number of nodes) and `procs` (GPUs per node)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Choose ONE of the following:\n", - "\n", - "# Option 1: Single Node (8 GPUs)\n", - "process_config = nb.create_process_config(\n", - " procs=8,\n", - " with_gpus=True,\n", - " hosts=None # None = single node\n", - ")\n", - "\n", - "# Option 2: Multi-Node (4 nodes ร— 8 GPUs = 32 total)\n", - "# Uncomment to use:\n", - "# process_config = nb.create_process_config(\n", - "# procs=8,\n", - "# with_gpus=True,\n", - "# hosts=4\n", - "# )\n", - "\n", - "print(\"๐Ÿ–ฅ๏ธ Resource Configuration:\")\n", - "for key, value in process_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "\n", - "if \"hosts\" in process_config and process_config[\"hosts\"]:\n", - " total_gpus = process_config[\"hosts\"] * process_config[\"procs\"]\n", - " print(f\"\\n๐Ÿ“Š Total GPUs: {total_gpus}\")\n", - "else:\n", - " print(f\"\\n๐Ÿ“Š Total GPUs: {process_config['procs']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## โ˜๏ธ Provisioner Configuration (Optional)\n", - "\n", - "**Only needed for multi-node training on SLURM clusters.**\n", - "\n", - "โš ๏ธ Skip this cell if you're running single-node training!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Provisioner Configuration (OPTIONAL - for multi-node only)\n", - "# Set to None for single-node training\n", - "\n", - "provisioner_config = None # Default: no provisioner\n", - "\n", - "# Uncomment and configure for SLURM multi-node training:\n", - "# provisioner_config = nb.create_provisioner_config(\n", - "# launcher=\"slurm\",\n", - "# job_name=\"sft_training\",\n", - "# partition=\"your_gpu_partition\", # REQUIRED for SLURM\n", - "# time=\"24:00:00\", # REQUIRED for SLURM\n", - "# account=\"your_account\" # May be required\n", - "# )\n", - "\n", - "if provisioner_config:\n", - " print(\"โ˜๏ธ Provisioner Configuration:\")\n", - " for key, value in provisioner_config.items():\n", - " print(f\" โ€ข {key}: {value}\")\n", - "else:\n", - " print(\"โ˜๏ธ Provisioner: Disabled (single-node mode)\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ”จ Build Complete Configuration\n", - "\n", - "Combine all configurations into a single config object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Build complete configuration\n", - "config = nb.build_config(\n", - " model_config=model_config,\n", - " optimizer_config=optimizer_config,\n", - " lr_scheduler_config=lr_scheduler_config,\n", - " training_config=training_config,\n", - " parallelism_config=parallelism_config,\n", - " checkpoint_config=checkpoint_config,\n", - " activation_checkpoint_config=activation_checkpoint_config,\n", - " process_config=process_config,\n", - " provisioner_config=provisioner_config\n", - ")\n", - "\n", - "print(\"โœ… Configuration built successfully!\\n\")\n", - "\n", - "# Display summary\n", - "nb.summarize_config(config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“„ View Full Configuration (YAML)\n", - "\n", - "See the complete configuration in YAML format." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Print full configuration\n", - "nb.print_config(config, title=\"Complete Training Configuration\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ’พ Save Configuration (Optional)\n", - "\n", - "Save the configuration to a YAML file for later use." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from omegaconf import OmegaConf\n", - "\n", - "# Save configuration\n", - "config_path = \"/home/hosseinkh/forge/apps/sft_v2/my_training_config.yaml\"\n", - "with open(config_path, 'w') as f:\n", - " OmegaConf.save(config, f)\n", - "\n", - "print(f\"โœ… Configuration saved to: {config_path}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿš€ Run Training!\n", - "\n", - "Start the training process with the configured settings.\n", - "\n", - "โš ๏ธ **Note**: This will start actual training and may take a long time!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run training\n", - "print(\"๐Ÿš€ Starting training...\\n\")\n", - "\n", - "try:\n", - " nb.train(config)\n", - " print(\"\\nโœ… Training completed successfully!\")\n", - "except Exception as e:\n", - " print(f\"\\nโŒ Training failed: {e}\")\n", - " import traceback\n", - " traceback.print_exc()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ” Advanced: Step-by-Step Execution\n", - "\n", - "For more control, you can run each training stage separately.\n", - "\n", - "โš ๏ธ **Only run this section if you want manual control. Otherwise, use the cell above.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 1: Initialize provisioner (if configured)\n", - "import asyncio\n", - "\n", - "provisioner_initialized = await nb.initialize_provisioner(config)\n", - "print(f\"Provisioner initialized: {provisioner_initialized}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 2: Create recipe\n", - "recipe = await nb.create_recipe(config)\n", - "print(\"Recipe created\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 3: Setup recipe (load model, data, etc.)\n", - "await nb.setup_recipe(recipe)\n", - "print(\"Recipe setup complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 4: Run training\n", - "await nb.train_recipe(recipe)\n", - "print(\"Training complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 5: Cleanup\n", - "await nb.cleanup_recipe(recipe)\n", - "print(\"Cleanup complete\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 6: Shutdown provisioner (if initialized)\n", - "if provisioner_initialized:\n", - " await nb.shutdown_provisioner(config)\n", - " print(\"Provisioner shutdown complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐Ÿ“Š Tips & Tricks\n", - "\n", - "### Memory Optimization\n", - "- Use **FSDP** (set `data_parallel_shard_degree=-1`) for large models\n", - "- Enable **activation checkpointing** (set `mode=\"selective\"` or `\"full\"`)\n", - "- Reduce **batch size** or **sequence length**\n", - "\n", - "### Speed Optimization\n", - "- Use **tensor parallelism** for large models (set `tensor_parallel_degree > 1`)\n", - "- Enable **compilation** (set `compile=True`)\n", - "- Increase **batch size** if memory allows\n", - "\n", - "### Multi-Node Training\n", - "- Set `hosts` in process config\n", - "- Configure provisioner with SLURM details\n", - "- Make sure model path is accessible on all nodes\n", - "\n", - "### Debugging\n", - "- Start with fewer steps (e.g., `steps=10`)\n", - "- Use single GPU first (`procs=1`)\n", - "- Check logs for errors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ๐ŸŽฏ Common Configurations\n", - "\n", - "### Quick Test Run\n", - "```python\n", - "training_config = nb.create_training_config(\n", - " steps=10,\n", - " local_batch_size=1\n", - ")\n", - "process_config = nb.create_process_config(procs=1)\n", - "```\n", - "\n", - "### Single Node, 8 GPUs, FSDP\n", - "```python\n", - 
"parallelism_config = nb.create_parallelism_config(\n", - " data_parallel_shard_degree=-1 # Use all 8 GPUs with FSDP\n", - ")\n", - "process_config = nb.create_process_config(procs=8)\n", - "```\n", - "\n", - "### Multi-Node, 4ร—8 GPUs, TP=2\n", - "```python\n", - "parallelism_config = nb.create_parallelism_config(\n", - " data_parallel_shard_degree=16, # 32 GPUs / 2 TP = 16 FSDP\n", - " tensor_parallel_degree=2\n", - ")\n", - "process_config = nb.create_process_config(procs=8, hosts=4)\n", - "provisioner_config = nb.create_provisioner_config(\n", - " launcher=\"slurm\",\n", - " partition=\"gpu_partition\"\n", - ")\n", - "```" - ] - } - ], - "metadata": { - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/apps/sft_v2/spawn_actor.py b/apps/sft_v2/spawn_actor.py new file mode 100644 index 000000000..eb9695c76 --- /dev/null +++ b/apps/sft_v2/spawn_actor.py @@ -0,0 +1,139 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +SpawnActor - Orchestrates the spawning and lifecycle management of actors. + +This module provides a high-level interface for creating, setting up, running, +and cleaning up different types of actors (e.g., Trainer, Evaluator, etc.) +""" + +import logging +from typing import Any, Type + +from forge.apps.sft_v2.actor import BaseForgeActor +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class SpawnActor: + """ + Orchestrator for spawning and managing actor lifecycles. + + This class handles the creation, setup, execution, and cleanup of actors + in a standardized way. + """ + + def __init__(self, actor_class: Type[BaseForgeActor], config: DictConfig): + """ + Initialize the spawn actor orchestrator. + + Args: + actor_class: The actor class to instantiate (must inherit from BaseForgeActor) + config: Configuration dictionary for the actor + """ + self.actor_class = actor_class + self.config = config + self.actor = None + + if not issubclass(actor_class, BaseForgeActor): + raise TypeError( + f"actor_class must be a subclass of BaseForgeActor, got {actor_class}" + ) + + async def spawn(self) -> Any: + """ + Spawn the actor instance with the given configuration. + + Returns: + The spawned actor instance + """ + logger.info(f"Spawning {self.actor_class.__name__}...") + + process_cfg = self.config.pop("processes", {}) + + self.actor = await self.actor_class.options(**process_cfg).as_actor(self.config) + + logger.info(f"{self.actor_class.__name__} spawned successfully.") + return self.actor + + async def setup(self): + """ + Setup the spawned actor (load data, checkpoints, etc.). + """ + if self.actor is None: + raise RuntimeError( + "Actor must be spawned before setup. Call spawn() first." + ) + + logger.info(f"Setting up {self.actor_class.__name__}...") + await self.actor.setup.call() + logger.info(f"{self.actor_class.__name__} setup complete.") + + async def run(self): + """ + Run the main execution logic of the actor. + """ + if self.actor is None: + raise RuntimeError( + "Actor must be spawned before running. Call spawn() first." + ) + + logger.info(f"Running {self.actor_class.__name__}...") + await self.actor.run.call() + logger.info(f"{self.actor_class.__name__} execution complete.") + + async def cleanup(self): + """ + Cleanup the actor resources and stop the mesh. 
+ """ + if self.actor is None: + raise RuntimeError( + "Actor must be spawned before cleanup. Call spawn() first." + ) + + logger.info(f"Cleaning up {self.actor_class.__name__}...") + await self.actor.cleanup.call() + + if hasattr(self.actor, "mesh"): + await self.actor.mesh.stop() + + logger.info(f"{self.actor_class.__name__} cleanup complete.") + + async def run_full_lifecycle(self): + """ + Execute the complete actor lifecycle: spawn -> setup -> run -> cleanup. + + This is a convenience method that runs all phases in sequence. + """ + logger.info(f"Starting full lifecycle for {self.actor_class.__name__}...") + + try: + await self.spawn() + await self.setup() + await self.run() + finally: + if self.actor is not None: + await self.cleanup() + + logger.info(f"Full lifecycle complete for {self.actor_class.__name__}.") + + +async def run_actor( + actor_class: Type[BaseForgeActor], + config: DictConfig, +) -> None: + """ + Convenience function to run an actor with full lifecycle management. + + Args: + actor_class: The actor class to instantiate + config: Configuration dictionary for the actor + """ + spawner = SpawnActor(actor_class, config) + await spawner.run_full_lifecycle() diff --git a/apps/sft_v2/trainer_actor.py b/apps/sft_v2/trainer_actor.py new file mode 100644 index 000000000..10c5e9b38 --- /dev/null +++ b/apps/sft_v2/trainer_actor.py @@ -0,0 +1,189 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Trainer actor implementation for SFT training. + +This is a concrete implementation of BaseForgeActor for supervised fine-tuning. +""" + +import logging + +import torch +import torchtitan.experiments.forge.train_spec as forge_train_spec +from forge.apps.sft_v2.actor import BaseForgeActor +from forge.apps.sft_v2.utils import ( + create_context_parallel_context, + log_training_step, + move_batch_to_device, + setup_sft_dataloader, + setup_tokenizer, +) +from monarch.actor import endpoint +from omegaconf import DictConfig + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class TrainerActor(BaseForgeActor): + """ + Concrete trainer actor for supervised fine-tuning. + + Handles training loop, forward/backward passes, and checkpoint management. + """ + + train_spec: forge_train_spec.ForgeTrainSpec + train_dataloader: any + num_training_steps: int + + def __init__(self, config: DictConfig): + """ + Initialize the trainer actor. + + Args: + config: Configuration dictionary containing training settings + """ + super().__init__(config) + self.num_training_steps = self.job_config.training.steps + + @endpoint + async def setup(self): + """ + Setup the trainer (load data, checkpoint, etc.). 
+ """ + logger.info("Setting up trainer actor...") + + self.tokenizer = setup_tokenizer( + hf_assets_path=self.job_config.model.hf_assets_path + ) + + self.train_dataloader = setup_sft_dataloader( + tokenizer=self.tokenizer, + dataset_path="yahma/alpaca-cleaned", + dataset_split="train", + target_tokens_per_pack=self.job_config.training.seq_len, + batch_size=self.job_config.training.local_batch_size, + device=self.device, + ) + + if self.checkpointer: + logger.info("Loading checkpoint...") + self.checkpointer.load(step=self.current_step) + + logger.info("Trainer setup complete.") + + def forward_backward( + self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor + ) -> torch.Tensor: + """ + Perform forward and backward pass. + + Args: + input_dict: Dictionary containing input tokens + labels: Ground truth labels + + Returns: + Computed loss value + """ + model_parts = self.model_parts + parallel_dims = self.parallel_dims + inputs = input_dict["tokens"] + + optional_context_parallel_ctx = create_context_parallel_context( + parallel_dims=parallel_dims, + inputs=inputs, + labels=labels, + model_parts=model_parts, + rotate_method=self.job_config.parallelism.context_parallel_rotate_method, + ) + + if parallel_dims.pp_enabled: + with self.train_context(optional_context_parallel_ctx): + targets, losses = ( + (labels, []) if self.pp_has_last_stage else (None, None) + ) + if self.pp_has_first_stage: + self.pp_schedule.step( + inputs, target=targets, losses=losses, input_batch=inputs + ) + else: + self.pp_schedule.step( + target=targets, losses=losses, input_batch=inputs + ) + + loss = ( + torch.mean(torch.stack(losses)).to(self.device) + if self.pp_has_last_stage + else torch.tensor([-1.0], device=self.device) + ) + else: + with self.train_context(optional_context_parallel_ctx): + assert len(model_parts) == 1 + with self.maybe_enable_amp: + pred = model_parts[0](inputs) + loss = self.loss_fn(pred, labels) + del pred + loss.backward() + + return loss + + def train_step(self, batch: dict[str, torch.Tensor]) -> None: + """ + Execute a single training step. + + Args: + batch: Dictionary containing batch data (tokens, labels, etc.) + """ + labels = batch.pop("labels") + loss = self.forward_backward(batch, labels) + + log_training_step(self.current_step, self.num_training_steps, loss, logger) + + self.optimizers.step() + self.lr_schedulers.step() + + @endpoint + async def run(self) -> None: + """ + Main training loop. + """ + logger.info("Starting training loop...") + + dataloader = iter(self.train_dataloader) + self.optimizers.zero_grad() + + while self.current_step < self.num_training_steps: + batch = next(dataloader) + batch = move_batch_to_device(batch, self.device) + + self.train_step(batch) + self.current_step += 1 + + if self.checkpointer: + self.checkpointer.save( + curr_step=self.current_step, + last_step=self.current_step == self.num_training_steps, + ) + + logger.info("Training complete!") + + @endpoint + async def cleanup(self) -> None: + """ + Cleanup resources (close checkpointer, logger, etc.). + """ + logger.info("Cleaning up trainer actor...") + + if self.checkpointer: + self.checkpointer.close() + if self.metric_logger: + self.metric_logger.close() + + logger.info("Cleanup complete.") + + def __repr__(self) -> str: + return "TrainerActor" diff --git a/apps/sft_v2/utils.py b/apps/sft_v2/utils.py new file mode 100644 index 000000000..6d0219805 --- /dev/null +++ b/apps/sft_v2/utils.py @@ -0,0 +1,187 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utility functions for SFT training actors. + +These utilities handle data loading, model setup, and common operations. +""" + +import logging +import os +from functools import partial +from typing import Any, Optional + +import torch +from forge.data.collate import collate_packed +from forge.data.datasets.packed import PackedDataset, TextPacker +from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset +from forge.data.tokenizer import HuggingFaceModelTokenizer +from torchdata.stateful_dataloader import StatefulDataLoader +from torchtitan.distributed import ParallelDims, utils as dist_utils + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def setup_tokenizer( + hf_assets_path: str, + tokenizer_filename: str = "tokenizer.json", + tokenizer_config_filename: str = "tokenizer_config.json", + generation_config_filename: str = "generation_config.json", +) -> HuggingFaceModelTokenizer: + """ + Setup HuggingFace tokenizer from model assets. + + Args: + hf_assets_path: Path to the directory containing tokenizer files + tokenizer_filename: Name of the tokenizer JSON file + tokenizer_config_filename: Name of the tokenizer config JSON file + generation_config_filename: Name of the generation config JSON file + + Returns: + Initialized HuggingFaceModelTokenizer + """ + tokenizer_json_path = os.path.join(hf_assets_path, tokenizer_filename) + tokenizer_config_path = os.path.join(hf_assets_path, tokenizer_config_filename) + generation_config_path = os.path.join(hf_assets_path, generation_config_filename) + + logger.info(f"Loading tokenizer from: {tokenizer_json_path}") + + tokenizer = HuggingFaceModelTokenizer( + tokenizer_json_path=tokenizer_json_path, + tokenizer_config_json_path=tokenizer_config_path, + generation_config_path=generation_config_path, + ) + + return tokenizer + + +def setup_sft_dataloader( + tokenizer: HuggingFaceModelTokenizer, + dataset_path: str, + dataset_split: str, + target_tokens_per_pack: int, + batch_size: int, + device: torch.device, + padding_idx: int = 0, + message_transform: Optional[Any] = None, +) -> StatefulDataLoader: + """ + Setup dataloader for SFT training. 
+ + Args: + tokenizer: Tokenizer to use for processing text + dataset_path: Path or name of the dataset (e.g., "yahma/alpaca-cleaned") + dataset_split: Dataset split to use (e.g., "train", "validation") + target_tokens_per_pack: Target sequence length for packing + batch_size: Batch size for training + device: Device to move tensors to + padding_idx: Padding token index + message_transform: Transform to convert dataset format to messages + + Returns: + Configured StatefulDataLoader + """ + if message_transform is None: + message_transform = AlpacaToMessages() + + logger.info(f"Loading SFT dataset from: {dataset_path}, split: {dataset_split}") + + dataset = sft_iterable_dataset( + model_transform=tokenizer, + message_transform=message_transform, + path=dataset_path, + split=dataset_split, + ) + + packer = TextPacker(padding_idx=padding_idx) + dataset = PackedDataset( + dataset=dataset, + packer=packer, + target_tokens_per_pack=target_tokens_per_pack, + ) + + dataloader = StatefulDataLoader( + dataset=dataset, + batch_size=batch_size, + collate_fn=partial( + collate_packed, mask_fn=packer.create_block_mask, device=device + ), + ) + + logger.info( + f"Created dataloader with batch_size={batch_size}, target_tokens={target_tokens_per_pack}" + ) + + return dataloader + + +def create_context_parallel_context( + parallel_dims: ParallelDims, + inputs: torch.Tensor, + labels: torch.Tensor, + model_parts: list, + rotate_method: str, +): + """ + Create context parallel context for distributed training. + + Args: + parallel_dims: Parallel dimensions configuration + inputs: Input tensor + labels: Label tensor + model_parts: List of model parts + rotate_method: Context parallel rotation method + + Returns: + Context parallel context or None if CP is not enabled + """ + if not parallel_dims.cp_enabled: + return None + + return dist_utils.create_context_parallel_ctx( + cp_mesh=parallel_dims.world_mesh["cp"], + cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], + cp_seq_dims=[1, 1] + [0 for _ in model_parts], + cp_no_restore_buffers={inputs, labels}, + cp_rotate_method=rotate_method, + ) + + +def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any]: + """ + Move batch tensors to the specified device. + + Args: + batch: Dictionary containing batch data + device: Target device + + Returns: + Batch with tensors moved to device + """ + for key, value in batch.items(): + if isinstance(value, torch.Tensor): + batch[key] = value.to(device) + return batch + + +def log_training_step( + step: int, + total_steps: int, + loss: torch.Tensor, + logger: logging.Logger, +): + """ + Log training step information. 
+ + Args: + step: Current training step + total_steps: Total number of training steps + loss: Current loss value + logger: Logger instance + """ + logger.info(f"Step {step}/{total_steps} | Loss: {loss.item():.4f}") From a0f62e72c6cd55aee47cfc50be6266b37a1d8e33 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Tue, 14 Oct 2025 11:05:04 -0700 Subject: [PATCH 3/7] Adding eval loop to the sft --- apps/sft/llama3_8b.yaml | 2 + apps/sft/main.py | 178 +++- apps/sft_v2/NOTEBOOK_GUIDE.md | 847 ------------------ apps/sft_v2/actor.py | 133 --- apps/sft_v2/interactive_config_notebook.ipynb | 629 ------------- apps/sft_v2/spawn_actor.py | 139 --- apps/sft_v2/trainer_actor.py | 189 ---- apps/sft_v2/utils.py | 187 ---- 8 files changed, 139 insertions(+), 2165 deletions(-) delete mode 100644 apps/sft_v2/NOTEBOOK_GUIDE.md delete mode 100644 apps/sft_v2/actor.py delete mode 100644 apps/sft_v2/interactive_config_notebook.ipynb delete mode 100644 apps/sft_v2/spawn_actor.py delete mode 100644 apps/sft_v2/trainer_actor.py delete mode 100644 apps/sft_v2/utils.py diff --git a/apps/sft/llama3_8b.yaml b/apps/sft/llama3_8b.yaml index 43a690c1e..2fd563a6c 100644 --- a/apps/sft/llama3_8b.yaml +++ b/apps/sft/llama3_8b.yaml @@ -33,6 +33,8 @@ training: steps: 1000 compile: false dataset: "c4" + #eval_interval: 500 # Setting eval_interval to run evaluation + #eval_steps: 100 # Number of validation batches during each evaluation run parallelism: data_parallel_replicate_degree: 1 diff --git a/apps/sft/main.py b/apps/sft/main.py index 27a8036d4..97ed4125e 100644 --- a/apps/sft/main.py +++ b/apps/sft/main.py @@ -7,7 +7,6 @@ """To run: python -m apps.sft.main --config apps/sft/llama3_8b.yaml - """ import asyncio @@ -40,8 +39,6 @@ from torchtitan.experiments.forge.engine import ForgeEngine from torchtitan.experiments.forge.job_config import ForgeJobConfig -# from tqdm import tqdm - # stubs for now Checkpointer = Any Dataloader = Any @@ -64,7 +61,7 @@ class ForgeSFTRecipe(ForgeActor, ForgeEngine): checkpointer: Checkpointer tokenizer: Tokenizer train_dataloader: Dataloader - # val_dataloader: Dataloader + val_dataloader: Dataloader metric_logger: MetricLogger profiler: Profiler device: torch.device @@ -81,6 +78,11 @@ def __init__(self, config: DictConfig): self.gradient_accumulation_steps = 1 # Example value, adjust as needed self._rank = current_rank().rank self._size = math.prod(current_size().values()) + + # Evaluation settings + self.eval_interval = job_config.training.get("eval_interval", float("inf")) + self.eval_steps = job_config.training.get("eval_steps", 0) + self._init_dist() super().__init__(job_config) @@ -111,25 +113,23 @@ def _init_dist(self): @endpoint async def setup(self): - self.train_dataloader = self.setup_data() - # self.train_dataloader = self.setup_data( - # self.train_config.train_dataset_config, - # self.train_config.train_dataloader_config, - # self.train_config.packing_config, - # ) - # self.val_dataloader = self.setup_data( - # self.train_config.val_dataset_config, - # self.train_config.val_dataloader_config, - # self.train_config.packing_config, - # ) - - # TODO: confirm that this is working properly - # Should also use load, not dcp_load + # Setup training data (first 90% of train split) + self.train_dataloader = self.setup_data( + dataset_path="yahma/alpaca-cleaned", dataset_split="train[:90%]" + ) + + # Setup validation data (last 10% of train split) + self.val_dataloader = self.setup_data( + dataset_path="yahma/alpaca-cleaned", dataset_split="train[90%:]" + ) + + # Load checkpoint if 
resuming self.checkpointer.load(step=self.current_step) - # self.profiler = self.setup_profiler(self.train_config.profiler_config) - # self.logger = self.setup_logger(self.train_config.logger_config) - def setup_data(self): + def setup_data( + self, dataset_path: str = "yahma/alpaca-cleaned", dataset_split: str = "train" + ): + """Setup data with configurable dataset path and split.""" print(os.path.join(self.job_config.model.hf_assets_path, "tokenizer.json")) tokenizer = HuggingFaceModelTokenizer( tokenizer_json_path=os.path.join( @@ -146,8 +146,8 @@ def setup_data(self): dataset = sft_iterable_dataset( model_transform=tokenizer, message_transform=AlpacaToMessages(), - path="yahma/alpaca-cleaned", - split="train", + path=dataset_path, + split=dataset_split, ) packer = TextPacker(padding_idx=0) dataset = PackedDataset( @@ -163,10 +163,6 @@ def setup_data(self): ), ) - # Ultimately we probably want something like this - # packer = build_packing_strategy(packing_config) - # dataset = build_dataset(dataset_config) - # dataloader = build_dataloader(dataloader_config, dataset, packer) return dataloader def forward_backward( @@ -206,7 +202,6 @@ def forward_backward( ) # accumulate losses across pipeline microbatches - # TODO: PP+FSDP unexpectedly puts the loss back to the CPU loss = ( torch.mean(torch.stack(losses)).to(self.device) if self.pp_has_last_stage @@ -225,27 +220,125 @@ def forward_backward( return loss + def forward_only( + self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor + ) -> torch.Tensor: + """Forward pass only for evaluation (no backward).""" + model_parts = self.model_parts + parallel_dims = self.parallel_dims + + inputs = input_dict["tokens"] + optional_context_parallel_ctx = ( + dist_utils.create_context_parallel_ctx( + cp_mesh=parallel_dims.world_mesh["cp"], + cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], + cp_seq_dims=[1, 1] + [0 for _ in model_parts], + cp_no_restore_buffers={inputs, labels}, + cp_rotate_method=self.job_config.parallelism.context_parallel_rotate_method, + ) + if parallel_dims.cp_enabled + else None + ) + + if parallel_dims.pp_enabled: + # Pipeline Parallel forward only + with self.train_context(optional_context_parallel_ctx): + targets, losses = ( + (labels, []) if self.pp_has_last_stage else (None, None) + ) + if self.pp_has_first_stage: + self.pp_schedule.step( + inputs, target=targets, losses=losses, input_batch=inputs + ) + else: + self.pp_schedule.step( + target=targets, losses=losses, input_batch=inputs + ) + + loss = ( + torch.mean(torch.stack(losses)).to(self.device) + if self.pp_has_last_stage + else torch.tensor([-1.0], device=self.device) + ) + else: + # Non-PP forward only + with self.train_context(optional_context_parallel_ctx): + assert len(model_parts) == 1 + with self.maybe_enable_amp: + pred = model_parts[0](inputs) + loss = self.loss_fn(pred, labels) + del pred + + return loss + def train_step(self, batch) -> None: - # TODO - # with GradientAccumulation( - # self.gradient_accumulation_steps, - # self.model, - # self.data_parallel_size, - # ) as grad_acc: labels = batch.pop("labels") loss = self.forward_backward(batch, labels) logger.info(f"{self.current_step} / {self.num_training_steps}|Loss: {loss}") - # self.pbar.set_description(f"{self.current_step}|Loss: {loss}") - # self.pbar.update(1) self.optimizers.step() self.lr_schedulers.step() + async def evaluate(self) -> dict[str, float]: + """Run evaluation on validation set (internal method, not an endpoint).""" + logger.info("=" * 50) + 
logger.info("STARTING EVALUATION ") + logger.info("=" * 50) + + # Set model to eval mode + for model_part in self.model_parts: + model_part.eval() + + val_dataloader = iter(self.val_dataloader) + total_loss = 0.0 + num_batches = 0 + + with torch.no_grad(): + for step in range(self.eval_steps): + try: + batch = next(val_dataloader) + + # Move tensors to device + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + batch[k] = v.to(self.device) + + labels = batch.pop("labels") + loss = self.forward_only(batch, labels) + + total_loss += loss.item() + num_batches += 1 + + logger.info( + f" Eval batch {num_batches}/{self.eval_steps} | Loss: {loss.item():.4f}" + ) + + except StopIteration: + logger.warning("Reached end of validation dataloader early") + break + + # Set model back to train mode + for model_part in self.model_parts: + model_part.train() + + avg_loss = total_loss / max(num_batches, 1) + + metrics = { + "val_loss": avg_loss, + "val_batches": num_batches, + } + + logger.info("-" * 50) + logger.info(f"EVALUATION COMPLETE") + logger.info(f"Validation Loss: {avg_loss:.4f}") + logger.info(f"Batches Evaluated: {num_batches}") + logger.info("=" * 50) + return metrics + @endpoint async def train(self) -> None: dataloader = iter(self.train_dataloader) self.optimizers.zero_grad() - # TODO: tqdm is broken in Monarch actors # self.pbar = tqdm(initial=self.current_step, total=self.num_training_steps) @@ -254,18 +347,21 @@ async def train(self) -> None: # Move tensors to the appropriate device for k, v in batch.items(): if isinstance(v, torch.Tensor): - batch[k] = v.to("cuda") # TODO: hardcoded for now + batch[k] = v.to(self.device) # TODO: hardcoded for now self.train_step(batch) - # self.profiler.step() self.current_step += 1 + # Run evaluation periodically + if self.current_step % self.eval_interval == 0: + eval_metrics = await self.evaluate() + logger.info(f"Step {self.current_step} | Eval metrics: {eval_metrics}") + + # Save checkpoints self.checkpointer.save( curr_step=self.current_step, last_step=self.current_step == self.num_training_steps, ) - # self.pbar.close() - @endpoint async def cleanup(self) -> None: if self.checkpointer: diff --git a/apps/sft_v2/NOTEBOOK_GUIDE.md b/apps/sft_v2/NOTEBOOK_GUIDE.md deleted file mode 100644 index b3524ed31..000000000 --- a/apps/sft_v2/NOTEBOOK_GUIDE.md +++ /dev/null @@ -1,847 +0,0 @@ -# Complete Guide: Interactive Configuration Notebook - -This guide explains step-by-step how to use the interactive configuration notebook for SFT training. - ---- - -## ๐Ÿ“– Table of Contents - -1. [Overview](#overview) -2. [Architecture Components](#architecture-components) -3. [Notebook Step-by-Step](#notebook-step-by-step) -4. [Utility Functions Explained](#utility-functions-explained) -5. [How to Run](#how-to-run) -6. [Common Scenarios](#common-scenarios) -7. [Troubleshooting](#troubleshooting) - ---- - -## Overview - -The interactive configuration notebook (`interactive_config_notebook.ipynb`) allows you to: -- Configure SFT training **without YAML files** -- Define configuration interactively in separate cells -- Easily modify parameters and experiment -- Use pre-built templates for common scenarios - -### What Problem Does This Solve? 
- -**Before**: You had to edit YAML files, which required: -- External file management -- Reloading files after changes -- Difficult to experiment with different configs - -**After**: You can: -- Define everything in the notebook -- Change values in cells and re-run -- See all configurations clearly -- No external file management needed - ---- - -## Architecture Components - -Before diving into the notebook, let's understand the components: - -### 1. BaseForgeActor (`actor.py`) - -**What it is**: An abstract base class that defines the contract for all actors. - -**What it does**: -- Handles distributed initialization (sets up multi-GPU environment) -- Manages common attributes (model, optimizer, checkpointer, etc.) -- Defines three required methods that subclasses must implement: - - `setup()` - Initialize data, checkpoints, etc. - - `run()` - Main execution logic - - `cleanup()` - Resource cleanup - -**Why it matters**: Provides a consistent interface for different actor types (Trainer, Evaluator, Inferencer, etc.) - -### 2. TrainerActor (`trainer_actor.py`) - -**What it is**: A concrete implementation of BaseForgeActor for training. - -**What it does**: -- Implements the training loop -- Handles forward/backward passes -- Manages checkpointing -- Supports various parallelism strategies (FSDP, Pipeline Parallel, Tensor Parallel) - -**Key Methods**: -- `setup()` - Loads tokenizer, dataset, and checkpoints -- `run()` - Executes the training loop -- `forward_backward()` - Performs forward and backward passes -- `train_step()` - Single training step -- `cleanup()` - Closes resources - -### 3. SpawnActor (`spawn_actor.py`) - -**What it is**: An orchestrator that manages actor lifecycle. - -**What it does**: -- Creates actor instances -- Manages the lifecycle: spawn โ†’ setup โ†’ run โ†’ cleanup -- Provides error handling and cleanup guarantees - -**Key Methods**: -- `spawn()` - Creates the actor instance -- `setup()` - Calls actor's setup -- `run()` - Calls actor's run -- `cleanup()` - Calls actor's cleanup and stops the mesh -- `run_full_lifecycle()` - Executes all phases automatically - -**Why it matters**: Simplifies actor management and ensures proper resource cleanup. - -### 4. Utility Functions (`utils.py`) - -Helper functions for common operations. See [Utility Functions Explained](#utility-functions-explained) section below. - ---- - -## Notebook Step-by-Step - -### Step 1: Import Dependencies - -```python -import asyncio -import logging -from omegaconf import OmegaConf, DictConfig - -from forge.apps.sft_v2.trainer_actor import TrainerActor -from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor -``` - -**What this does**: -- `asyncio` - For async/await operations (actors run asynchronously) -- `logging` - For logging training progress -- `OmegaConf` - For managing configurations (converts dicts to config objects) -- `TrainerActor` - The training actor we'll use -- `SpawnActor`, `run_actor` - For managing actor lifecycle - -**Why we need it**: These are the core dependencies for running the actor-based training. 
- ---- - -### Step 2: Configure Model Settings - -```python -model_config = { - "name": "llama3", - "flavor": "8B", - "hf_assets_path": "/tmp/Meta-Llama-3.1-8B-Instruct" -} -``` - -**What this does**: -- `name` - Model architecture type (e.g., "llama3", "llama2") -- `flavor` - Model size (e.g., "8B", "70B", "405B") -- `hf_assets_path` - Path to the model files (tokenizer, weights, config) - -**How to modify**: -- Change `flavor` to use different model sizes -- Update `hf_assets_path` to point to your model location -- Make sure the path contains `tokenizer.json`, `tokenizer_config.json`, and model weights - -**Example variations**: -```python -# For a 70B model -model_config = { - "name": "llama3", - "flavor": "70B", - "hf_assets_path": "/path/to/Meta-Llama-3.1-70B" -} -``` - ---- - -### Step 3: Configure Process Settings - -```python -processes_config = { - "procs": 8, # Number of processes - "with_gpus": True # Use GPUs -} -``` - -**What this does**: -- `procs` - Number of parallel processes (usually = number of GPUs) -- `with_gpus` - Whether to use GPUs or CPUs - -**How to modify**: -- For single GPU: `"procs": 1` -- For 4 GPUs: `"procs": 4` -- For CPU training: `"with_gpus": False` (not recommended for LLMs) - -**Important**: Set `procs` to match your available GPUs! - ---- - -### Step 4: Configure Optimizer Settings - -```python -optimizer_config = { - "name": "AdamW", - "lr": 1e-5, # Learning rate - "eps": 1e-8 -} -``` - -**What this does**: -- `name` - Optimizer type (AdamW is recommended for LLMs) -- `lr` - Learning rate (how fast the model learns) -- `eps` - Epsilon for numerical stability - -**How to modify**: -- **Lower learning rate** (e.g., `1e-6`) for fine-tuning -- **Higher learning rate** (e.g., `5e-5`) for pre-training (use with caution) -- Typical range for fine-tuning: `1e-6` to `1e-4` - -**Tips**: -- Start conservative with `1e-5` or `2e-5` -- If loss explodes, reduce learning rate -- If training is too slow, slightly increase learning rate - ---- - -### Step 5: Configure Learning Rate Scheduler - -```python -lr_scheduler_config = { - "warmup_steps": 200 # Number of warmup steps -} -``` - -**What this does**: -- `warmup_steps` - Number of steps to gradually increase learning rate from 0 to `lr` - -**Why warmup**: Prevents training instability at the beginning by starting with a low learning rate. 
- -**How to modify**: -- For short training (< 1000 steps): use 10-50 warmup steps -- For medium training (1000-5000 steps): use 100-200 warmup steps -- For long training (> 5000 steps): use 200-500 warmup steps -- Rule of thumb: ~5-10% of total training steps - ---- - -### Step 6: Configure Training Settings - -```python -training_config = { - "local_batch_size": 1, # Batch size per GPU - "seq_len": 2048, # Sequence length - "max_norm": 1.0, # Gradient clipping - "steps": 1000, # Total training steps - "compile": False, # PyTorch compilation - "dataset": "c4" # Dataset name -} -``` - -**What this does**: -- `local_batch_size` - Number of samples per GPU per step -- `seq_len` - Maximum sequence length (in tokens) -- `max_norm` - Gradient clipping threshold (prevents exploding gradients) -- `steps` - Total number of training steps -- `compile` - Enable PyTorch 2.0 compilation (experimental) -- `dataset` - Dataset identifier - -**How to modify**: - -**For Memory Issues**: -- Reduce `seq_len` (e.g., from 2048 to 1024) -- Reduce `local_batch_size` (e.g., from 2 to 1) -- Both reduce memory usage - -**For Faster Training**: -- Increase `local_batch_size` if you have memory -- Use shorter `seq_len` for tasks that don't need long context - -**For Quick Testing**: -- Set `steps` to 10-100 for quick validation - -**Global batch size** = `local_batch_size` ร— `procs` ร— `data_parallel_shard_degree` - ---- - -### Step 7: Configure Parallelism Settings - -```python -parallelism_config = { - "data_parallel_replicate_degree": 1, - "data_parallel_shard_degree": -1, # -1 = use all GPUs for FSDP - "tensor_parallel_degree": 1, - "pipeline_parallel_degree": 1, - "context_parallel_degree": 1, - "expert_parallel_degree": 1, - "disable_loss_parallel": False -} -``` - -**What this does**: - -- **Data Parallel Shard Degree (FSDP)**: Splits model parameters across GPUs - - `-1` means use all available GPUs - - `8` means split across 8 GPUs - - Most common strategy for fine-tuning - -- **Tensor Parallel Degree**: Splits individual layers across GPUs - - Use for very large models that don't fit on single GPU even with FSDP - - `1` means no tensor parallelism - -- **Pipeline Parallel Degree**: Splits model into sequential stages - - Use for extremely large models - - `1` means no pipeline parallelism - -- **Context Parallel Degree**: Splits sequence dimension - - For very long sequences - - `1` means no context parallelism - -**Common Configurations**: - -**Single GPU**: -```python -"data_parallel_shard_degree": 1 -``` - -**8 GPUs with FSDP (recommended)**: -```python -"data_parallel_shard_degree": -1 # or 8 -``` - -**Large Model (70B+) with Tensor Parallelism**: -```python -"data_parallel_shard_degree": 4, -"tensor_parallel_degree": 2 -``` - ---- - -### Step 8: Configure Checkpoint Settings - -```python -checkpoint_config = { - "enable": True, - "folder": "/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints", - "initial_load_path": "/tmp/Meta-Llama-3.1-8B-Instruct/", - "initial_load_in_hf": True, - "last_save_in_hf": True, - "interval": 500, # Save every N steps - "async_mode": "disabled" -} -``` - -**What this does**: -- `enable` - Whether to enable checkpointing -- `folder` - Where to save checkpoints -- `initial_load_path` - Where to load initial weights from -- `initial_load_in_hf` - Load weights in HuggingFace format -- `last_save_in_hf` - Save final checkpoint in HuggingFace format -- `interval` - How often to save (in steps) -- `async_mode` - Async saving mode (use "disabled" for simplicity) - -**How to 
modify**: -- **Save more frequently**: Reduce `interval` (e.g., 100) -- **Save less frequently**: Increase `interval` (e.g., 1000) -- **Resume training**: Point `initial_load_path` to your checkpoint folder - -**Important**: Make sure `folder` path exists and has enough disk space! - ---- - -### Step 9: Configure Activation Checkpointing - -```python -activation_checkpoint_config = { - "mode": "selective", - "selective_ac_option": "op" -} -``` - -**What this does**: -- Saves memory by recomputing activations during backward pass instead of storing them -- `mode` - Checkpointing mode ("selective" or "full") -- `selective_ac_option` - Which operations to checkpoint - -**Memory vs Speed Trade-off**: -- **Activation checkpointing ON**: Lower memory, slower training -- **Activation checkpointing OFF**: Higher memory, faster training - -**When to use**: Enable when running out of memory. - ---- - -### Step 10: Configure Communication Settings - -```python -comm_config = { - "trace_buf_size": 0 -} -``` - -**What this does**: -- Configuration for distributed communication (required by TorchTitan) -- Usually you don't need to modify this - ---- - -### Step 11: Combine All Configurations - -```python -complete_config = { - "comm": comm_config, - "model": model_config, - "processes": processes_config, - "optimizer": optimizer_config, - "lr_scheduler": lr_scheduler_config, - "training": training_config, - "parallelism": parallelism_config, - "checkpoint": checkpoint_config, - "activation_checkpoint": activation_checkpoint_config -} - -cfg = OmegaConf.create(complete_config) -``` - -**What this does**: -- Combines all configuration sections into one complete config -- Converts to OmegaConf format (allows dot notation access) - -**Prints**: The complete configuration in YAML format for review - ---- - -### Step 12: Run Training (Simple Way) - -```python -await run_actor(TrainerActor, cfg) -``` - -**What this does**: -- Spawns the trainer actor -- Runs setup (loads data, model, checkpoints) -- Runs training loop -- Cleans up resources -- All in one line! - -**When to use**: When you want fully automatic training with no manual intervention. 
- ---- - -### Alternative: Manual Lifecycle Control - -For more control over the training process: - -#### Create and Spawn the Actor - -```python -spawner = SpawnActor(TrainerActor, cfg) -actor = await spawner.spawn() -``` - -**What this does**: -- Creates a spawner with your config -- Spawns the actor instance (allocates resources, initializes distributed environment) - -#### Setup the Actor - -```python -await spawner.setup() -``` - -**What this does**: -- Loads tokenizer from `hf_assets_path` -- Loads training dataset -- Initializes model -- Loads checkpoint if specified - -**At this point**: You could inspect the actor state before training: -```python -print(f"Current step: {actor.current_step}") -print(f"Device: {actor.device}") -``` - -#### Run Training - -```python -await spawner.run() -``` - -**What this does**: -- Executes the training loop -- Iterates through batches -- Performs forward/backward passes -- Updates weights -- Saves checkpoints at intervals - -#### Cleanup - -```python -await spawner.cleanup() -``` - -**What this does**: -- Closes checkpointer -- Closes logger -- Stops the actor mesh -- Frees resources - -**When to use manual control**: -- When you want to inspect state between phases -- When you want to modify configuration between setup and run -- For debugging purposes - ---- - -## Utility Functions Explained - -The `utils.py` module provides reusable helper functions: - -### 1. `setup_tokenizer()` - -```python -def setup_tokenizer( - hf_assets_path: str, - tokenizer_filename: str = "tokenizer.json", - tokenizer_config_filename: str = "tokenizer_config.json", - generation_config_filename: str = "generation_config.json", -) -> HuggingFaceModelTokenizer -``` - -**What it does**: -- Loads a HuggingFace tokenizer from the model assets directory -- Initializes tokenizer with config and generation settings - -**Parameters**: -- `hf_assets_path` - Path to directory containing tokenizer files -- Other parameters are filenames (usually don't need to change) - -**Returns**: Initialized `HuggingFaceModelTokenizer` object - -**Example**: -```python -tokenizer = setup_tokenizer("/tmp/Meta-Llama-3.1-8B-Instruct") -``` - -**When to use**: If you need to use the tokenizer independently (e.g., for preprocessing data) - ---- - -### 2. 
`setup_sft_dataloader()` - -```python -def setup_sft_dataloader( - tokenizer: HuggingFaceModelTokenizer, - dataset_path: str, - dataset_split: str, - target_tokens_per_pack: int, - batch_size: int, - device: torch.device, - padding_idx: int = 0, - message_transform: Optional[Any] = None, -) -> StatefulDataLoader -``` - -**What it does**: -- Creates a dataloader for supervised fine-tuning -- Handles data loading, tokenization, and packing -- Returns a StatefulDataLoader (can save/restore state for checkpointing) - -**Parameters**: -- `tokenizer` - Tokenizer to use for text processing -- `dataset_path` - HuggingFace dataset name (e.g., "yahma/alpaca-cleaned") -- `dataset_split` - Which split to use ("train", "validation", "test") -- `target_tokens_per_pack` - Sequence length (same as `seq_len` in config) -- `batch_size` - Batch size (same as `local_batch_size` in config) -- `device` - Which device to move tensors to -- `padding_idx` - Token ID for padding (usually 0) -- `message_transform` - Transform to convert dataset format (default: AlpacaToMessages) - -**Returns**: Configured `StatefulDataLoader` - -**Example**: -```python -dataloader = setup_sft_dataloader( - tokenizer=tokenizer, - dataset_path="yahma/alpaca-cleaned", - dataset_split="train", - target_tokens_per_pack=2048, - batch_size=4, - device=torch.device("cuda"), -) -``` - -**When to use**: If you want to create a custom dataloader outside of TrainerActor - ---- - -### 3. `create_context_parallel_context()` - -```python -def create_context_parallel_context( - parallel_dims: ParallelDims, - inputs: torch.Tensor, - labels: torch.Tensor, - model_parts: list, - rotate_method: str, -) -``` - -**What it does**: -- Creates context for context parallelism (splits sequence across GPUs) -- Returns None if context parallelism is disabled - -**Parameters**: -- `parallel_dims` - Parallel dimensions configuration -- `inputs` - Input tensor -- `labels` - Label tensor -- `model_parts` - List of model parts -- `rotate_method` - Rotation method for context parallel - -**Returns**: Context parallel context or None - -**When to use**: Internally used by TrainerActor. You rarely need to call this directly. - ---- - -### 4. `move_batch_to_device()` - -```python -def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any] -``` - -**What it does**: -- Moves all tensors in a batch dictionary to the specified device -- Leaves non-tensor values unchanged - -**Parameters**: -- `batch` - Dictionary containing batch data -- `device` - Target device (e.g., `torch.device("cuda")`) - -**Returns**: Batch with tensors moved to device - -**Example**: -```python -batch = {"tokens": tensor, "labels": tensor, "metadata": "some_string"} -batch = move_batch_to_device(batch, torch.device("cuda")) -``` - -**When to use**: Useful when manually processing batches - ---- - -### 5. `log_training_step()` - -```python -def log_training_step( - step: int, - total_steps: int, - loss: torch.Tensor, - logger: logging.Logger, -) -``` - -**What it does**: -- Logs training progress in a formatted way -- Shows current step, total steps, and loss value - -**Parameters**: -- `step` - Current training step -- `total_steps` - Total number of training steps -- `loss` - Current loss tensor -- `logger` - Logger instance - -**Example output**: -``` -Step 100/1000 | Loss: 2.3456 -``` - -**When to use**: Internally used by TrainerActor. You can use it for custom logging. - ---- - -## How to Run - -### Prerequisites - -1. 
**Download Model**: -```bash -export HF_HUB_DISABLE_XET=1 -forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct -``` - -2. **Check GPU Availability**: -```bash -nvidia-smi # Should show your GPUs -``` - -### Running the Notebook - -#### Option 1: Using Jupyter Notebook - -1. **Start Jupyter**: -```bash -cd /home/hosseinkh/TorchForge/forge -jupyter notebook -``` - -2. **Open the notebook**: - - Navigate to `apps/sft_v2/interactive_config_notebook.ipynb` - - Click to open - -3. **Run cells sequentially**: - - Click on first cell, press `Shift + Enter` to run - - Continue through all cells - - Modify configuration cells as needed - - Run Step 12 to start training - -#### Option 2: Using VS Code - -1. **Open notebook in VS Code**: - - File โ†’ Open โ†’ `interactive_config_notebook.ipynb` - -2. **Select Python kernel**: - - Click "Select Kernel" in top right - - Choose your Python environment - -3. **Run cells**: - - Click "Run Cell" button on each cell - - Or press `Shift + Enter` - -#### Option 3: Using Command Line (with simplified entry point) - -```bash -cd /home/hosseinkh/TorchForge/forge -python -m apps.sft_v2.notebook_main --config apps/sft_v2/llama3_8b.yaml -``` - -Note: This uses a YAML file, but you can use the notebook for interactive config. - ---- - -## Common Scenarios - -### Scenario 1: Quick Test (1 GPU, 100 steps) - -```python -# Modify these cells: -processes_config = {"procs": 1, "with_gpus": True} -training_config = { - "local_batch_size": 1, - "seq_len": 1024, - "steps": 100, # Just 100 steps - ... -} -``` - -**Expected time**: 5-10 minutes on A100 - -### Scenario 2: Full Training (8 GPUs, 5000 steps) - -```python -processes_config = {"procs": 8, "with_gpus": True} -training_config = { - "local_batch_size": 2, - "seq_len": 2048, - "steps": 5000, - ... -} -parallelism_config = { - "data_parallel_shard_degree": -1, # Use all 8 GPUs - ... -} -``` - -**Expected time**: Several hours depending on hardware - -### Scenario 3: Memory-Constrained Training - -```python -training_config = { - "local_batch_size": 1, # Small batch - "seq_len": 1024, # Shorter sequence - ... -} -activation_checkpoint_config = { - "mode": "selective", # Enable AC for memory savings - ... -} -``` - -**Use when**: Running out of GPU memory - -### Scenario 4: Resume from Checkpoint - -```python -checkpoint_config = { - "enable": True, - "folder": "/path/to/previous/checkpoints", - "initial_load_path": "/path/to/previous/checkpoints/step_1000", - "interval": 500, - ... -} -``` - -**Use when**: Continuing training from a saved checkpoint - ---- - -## Troubleshooting - -### Problem: "CUDA out of memory" - -**Solutions**: -1. Reduce `seq_len` (e.g., from 2048 to 1024) -2. Reduce `local_batch_size` (e.g., from 2 to 1) -3. Enable activation checkpointing -4. Use more GPUs with FSDP - -### Problem: "Loss is NaN or exploding" - -**Solutions**: -1. Reduce learning rate (e.g., from `1e-5` to `1e-6`) -2. Increase gradient clipping (`max_norm` from 1.0 to 0.5) -3. Increase warmup steps - -### Problem: "Training is too slow" - -**Solutions**: -1. Increase `local_batch_size` if memory allows -2. Use more GPUs -3. Reduce `seq_len` if your task doesn't need long context -4. Enable compilation (`compile: True`) - -### Problem: "Cannot find tokenizer files" - -**Solutions**: -1. Check `hf_assets_path` is correct -2. Ensure path contains `tokenizer.json` and `tokenizer_config.json` -3. 
Re-download model if files are missing - -### Problem: "Actor spawning fails" - -**Solutions**: -1. Check you have enough GPUs for `procs` -2. Verify CUDA is available (`torch.cuda.is_available()`) -3. Check no other processes are using GPUs - ---- - -## Summary - -**Key Takeaways**: - -1. **Interactive Configuration**: Define all settings in notebook cells, no YAML needed -2. **Step-by-Step**: Configure model, processes, optimizer, training, parallelism, checkpoints separately -3. **Two Ways to Run**: Simple (`run_actor()`) or manual (lifecycle control) -4. **Utility Functions**: Helper functions for tokenization, data loading, device management -5. **Templates Provided**: Quick test, multi-GPU, memory-efficient configs ready to use -6. **Flexible**: Easy to modify parameters and experiment - -**Next Steps**: -1. Download your model -2. Open the notebook -3. Modify configuration cells for your needs -4. Run Step 12 to start training -5. Monitor logs for progress - -Happy Training! ๐Ÿš€ diff --git a/apps/sft_v2/actor.py b/apps/sft_v2/actor.py deleted file mode 100644 index 8607a39c4..000000000 --- a/apps/sft_v2/actor.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Abstract Actor class for training/inference actors in Forge. - -This provides a base class that can be extended for different types of actors -(e.g., Trainer, Evaluator, Inferencer, etc.) -""" - -import logging -import math -import os -from abc import ABC, abstractmethod -from typing import Any, Optional - -import torch -from forge.controller import ForgeActor -from monarch.actor import current_rank, current_size -from omegaconf import DictConfig, OmegaConf -from torch import nn -from torchtitan.components.loss import LossFunction -from torchtitan.components.lr_scheduler import LRSchedulersContainer -from torchtitan.components.optimizer import OptimizersContainer -from torchtitan.distributed import ParallelDims -from torchtitan.experiments.forge.engine import ForgeEngine -from torchtitan.experiments.forge.job_config import ForgeJobConfig - -Checkpointer = Any -Dataloader = Any -MetricLogger = Any -Profiler = Any -Tokenizer = Any - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class BaseForgeActor(ForgeActor, ForgeEngine, ABC): - """ - Abstract base class for Forge actors. - - This class handles common initialization, distributed setup, and provides - abstract methods that must be implemented by concrete actor classes. - """ - - job_config: ForgeJobConfig - parallel_dims: ParallelDims - model: list[nn.Module] - loss_fn: Optional[LossFunction] - optimizer: Optional[OptimizersContainer] - lr_scheduler: Optional[LRSchedulersContainer] - checkpointer: Optional[Checkpointer] - tokenizer: Optional[Tokenizer] - metric_logger: Optional[MetricLogger] - profiler: Optional[Profiler] - device: torch.device - - def __init__(self, config: DictConfig): - """ - Initialize the base actor with configuration. 
- - Args: - config: Configuration dictionary containing job settings - """ - job_config = ForgeJobConfig().to_dict() - job_config = OmegaConf.merge(job_config, config) - - self.current_step = 0 - self.metric_logger = None - self.gradient_accumulation_steps = 1 - self._rank = current_rank().rank - self._size = math.prod(current_size().values()) - - self._init_dist() - super().__init__(job_config) - - def _init_dist(self): - """ - Initialize torch distributed environment. - - Sets up environment variables required for distributed training - in the Monarch actor framework. - """ - env = { - "RANK": str(self._rank), - "LOCAL_RANK": str(self._rank), - "LOCAL_WORLD_SIZE": str(self._size), - "GROUP_RANK": str(self._size), - "GROUP_WORLD_SIZE": str(self._size), - "ROLE_RANK": str(self._rank), - "ROLE_WORLD_SIZE": str(self._size), - "ROLE_NAME": "rank", - "WORLD_SIZE": str(self._size), - "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", - } - os.environ.update(env) - logger.info(f"Initialized distributed environment: {env}") - - @abstractmethod - async def setup(self): - """ - Setup the actor (load data, checkpoint, etc.). - - This method must be implemented by concrete actor classes. - """ - pass - - @abstractmethod - async def run(self): - """ - Main execution logic for the actor. - - This method must be implemented by concrete actor classes. - """ - pass - - @abstractmethod - async def cleanup(self): - """ - Cleanup resources (close checkpointer, logger, etc.). - - This method must be implemented by concrete actor classes. - """ - pass - - @abstractmethod - def __repr__(self) -> str: - """String representation of the actor.""" - pass diff --git a/apps/sft_v2/interactive_config_notebook.ipynb b/apps/sft_v2/interactive_config_notebook.ipynb deleted file mode 100644 index 624f6a08a..000000000 --- a/apps/sft_v2/interactive_config_notebook.ipynb +++ /dev/null @@ -1,629 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# SFT Training - Interactive Configuration Notebook\n", - "\n", - "This notebook allows you to configure and run SFT training **without any YAML files**!\n", - "\n", - "## Benefits\n", - "\n", - "โœ… No external YAML files needed \n", - "โœ… Interactive configuration in separate cells \n", - "โœ… Easy to modify and experiment \n", - "โœ… All configuration visible in notebook \n", - "โœ… Quick templates for common scenarios" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: Import Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import asyncio\n", - "import logging\n", - "from omegaconf import OmegaConf, DictConfig\n", - "\n", - "from forge.apps.sft_v2.trainer_actor import TrainerActor\n", - "from forge.apps.sft_v2.spawn_actor import SpawnActor, run_actor\n", - "\n", - "logging.basicConfig(\n", - " level=logging.INFO,\n", - " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Configure Model Settings\n", - "\n", - "Define your model configuration. 
**Modify these values as needed!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_config = {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - "}\n", - "\n", - "print(\"Model Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(model_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Configure Process Settings\n", - "\n", - "Define how many processes to use and whether to use GPUs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processes_config = {\n", - " \"procs\": 8, # Number of processes\n", - " \"with_gpus\": True # Use GPUs\n", - "}\n", - "\n", - "print(\"Process Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(processes_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Configure Optimizer Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer_config = {\n", - " \"name\": \"AdamW\",\n", - " \"lr\": 1e-5, # Learning rate\n", - " \"eps\": 1e-8\n", - "}\n", - "\n", - "print(\"Optimizer Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(optimizer_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5: Configure Learning Rate Scheduler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lr_scheduler_config = {\n", - " \"warmup_steps\": 200 # Number of warmup steps\n", - "}\n", - "\n", - "print(\"LR Scheduler Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(lr_scheduler_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 6: Configure Training Settings\n", - "\n", - "**Key parameters to adjust for your experiment:**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_config = {\n", - " \"local_batch_size\": 1, # Batch size per GPU\n", - " \"seq_len\": 2048, # Sequence length\n", - " \"max_norm\": 1.0, # Gradient clipping\n", - " \"steps\": 1000, # Total training steps\n", - " \"compile\": False, # PyTorch compilation\n", - " \"dataset\": \"c4\" # Dataset name\n", - "}\n", - "\n", - "print(\"Training Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(training_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 7: Configure Parallelism Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parallelism_config = {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": -1, # -1 means use all available GPUs for FSDP\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - "}\n", - "\n", - "print(\"Parallelism Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(parallelism_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 8: Configure Checkpoint Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"checkpoint_config = {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/Meta-Llama-3.1-8B-Instruct/saved_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 500, # Save every N steps\n", - " \"async_mode\": \"disabled\"\n", - "}\n", - "\n", - "print(\"Checkpoint Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(checkpoint_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 9: Configure Activation Checkpointing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "activation_checkpoint_config = {\n", - " \"mode\": \"selective\",\n", - " \"selective_ac_option\": \"op\"\n", - "}\n", - "\n", - "print(\"Activation Checkpoint Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(activation_checkpoint_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 10: Configure Communication Settings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "comm_config = {\n", - " \"trace_buf_size\": 0\n", - "}\n", - "\n", - "print(\"Communication Configuration:\")\n", - "print(OmegaConf.to_yaml(OmegaConf.create(comm_config)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 11: Combine All Configurations\n", - "\n", - "Now let's merge everything into a complete configuration!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Combine all configs\n", - "complete_config = {\n", - " \"comm\": comm_config,\n", - " \"model\": model_config,\n", - " \"processes\": processes_config,\n", - " \"optimizer\": optimizer_config,\n", - " \"lr_scheduler\": lr_scheduler_config,\n", - " \"training\": training_config,\n", - " \"parallelism\": parallelism_config,\n", - " \"checkpoint\": checkpoint_config,\n", - " \"activation_checkpoint\": activation_checkpoint_config\n", - "}\n", - "\n", - "# Create OmegaConf DictConfig\n", - "cfg = OmegaConf.create(complete_config)\n", - "\n", - "print(\"=\" * 80)\n", - "print(\"COMPLETE CONFIGURATION\")\n", - "print(\"=\" * 80)\n", - "print(OmegaConf.to_yaml(cfg))\n", - "print(\"=\" * 80)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 12: Run Training (Simple Way)\n", - "\n", - "The simplest way - automatic lifecycle management!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run training with automatic lifecycle management\n", - "await run_actor(TrainerActor, cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Alternative: Manual Lifecycle Control\n", - "\n", - "For more control, manage each phase separately.\n", - "\n", - "### Create and Spawn the Actor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create the spawner\n", - "spawner = SpawnActor(TrainerActor, cfg)\n", - "\n", - "# Spawn the actor\n", - "actor = await spawner.spawn()\n", - "print(f\"โœ“ Actor spawned: {actor}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup the Actor" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Setup (load data, checkpoints, etc.)\n", - "await spawner.setup()\n", - "print(\"โœ“ Actor setup complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run training\n", - "await spawner.run()\n", - "print(\"โœ“ Training complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Cleanup resources\n", - "await spawner.cleanup()\n", - "print(\"โœ“ Cleanup complete\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# Quick Configuration Templates\n", - "\n", - "Here are ready-to-use templates for common scenarios!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Template 1: Quick Test (Single GPU, Small Steps)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "quick_test_config = OmegaConf.create({\n", - " \"comm\": {\"trace_buf_size\": 0},\n", - " \"model\": {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - " },\n", - " \"processes\": {\"procs\": 1, \"with_gpus\": True},\n", - " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", - " \"lr_scheduler\": {\"warmup_steps\": 10},\n", - " \"training\": {\n", - " \"local_batch_size\": 1,\n", - " \"seq_len\": 1024,\n", - " \"max_norm\": 1.0,\n", - " \"steps\": 100, # Just 100 steps for quick testing\n", - " \"compile\": False,\n", - " \"dataset\": \"c4\"\n", - " },\n", - " \"parallelism\": {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": 1,\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - " },\n", - " \"checkpoint\": {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/quick_test_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 50,\n", - " \"async_mode\": \"disabled\"\n", - " },\n", - " \"activation_checkpoint\": {\n", - " \"mode\": \"selective\",\n", - " \"selective_ac_option\": \"op\"\n", - " }\n", - "})\n", - "\n", - "print(\"Quick Test Configuration:\")\n", - "print(OmegaConf.to_yaml(quick_test_config))\n", - "\n", - "# To use: await run_actor(TrainerActor, quick_test_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Template 2: Multi-GPU Training (8 GPUs with FSDP)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "multi_gpu_config = OmegaConf.create({\n", - " \"comm\": {\"trace_buf_size\": 0},\n", - " \"model\": {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - " },\n", - " \"processes\": {\"procs\": 8, \"with_gpus\": True},\n", - " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 2e-5, \"eps\": 1e-8},\n", - " \"lr_scheduler\": {\"warmup_steps\": 200},\n", - " \"training\": {\n", - " \"local_batch_size\": 2,\n", - " \"seq_len\": 2048,\n", - " \"max_norm\": 1.0,\n", - " \"steps\": 5000,\n", - " \"compile\": False,\n", - " \"dataset\": \"c4\"\n", - " },\n", - " \"parallelism\": {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": 8, # FSDP across 8 GPUs\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - " },\n", - " \"checkpoint\": {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/multi_gpu_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 500,\n", - " \"async_mode\": \"disabled\"\n", - " },\n", - " \"activation_checkpoint\": {\n", - " \"mode\": \"selective\",\n", - " \"selective_ac_option\": \"op\"\n", - " }\n", - "})\n", - "\n", - 
"print(\"Multi-GPU Configuration:\")\n", - "print(OmegaConf.to_yaml(multi_gpu_config))\n", - "\n", - "# To use: await run_actor(TrainerActor, multi_gpu_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Template 3: Memory-Efficient Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "memory_efficient_config = OmegaConf.create({\n", - " \"comm\": {\"trace_buf_size\": 0},\n", - " \"model\": {\n", - " \"name\": \"llama3\",\n", - " \"flavor\": \"8B\",\n", - " \"hf_assets_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct\"\n", - " },\n", - " \"processes\": {\"procs\": 4, \"with_gpus\": True},\n", - " \"optimizer\": {\"name\": \"AdamW\", \"lr\": 1e-5, \"eps\": 1e-8},\n", - " \"lr_scheduler\": {\"warmup_steps\": 150},\n", - " \"training\": {\n", - " \"local_batch_size\": 1, # Small batch size\n", - " \"seq_len\": 1024, # Shorter sequence\n", - " \"max_norm\": 1.0,\n", - " \"steps\": 2000,\n", - " \"compile\": False,\n", - " \"dataset\": \"c4\"\n", - " },\n", - " \"parallelism\": {\n", - " \"data_parallel_replicate_degree\": 1,\n", - " \"data_parallel_shard_degree\": 4,\n", - " \"tensor_parallel_degree\": 1,\n", - " \"pipeline_parallel_degree\": 1,\n", - " \"context_parallel_degree\": 1,\n", - " \"expert_parallel_degree\": 1,\n", - " \"disable_loss_parallel\": False\n", - " },\n", - " \"checkpoint\": {\n", - " \"enable\": True,\n", - " \"folder\": \"/tmp/memory_efficient_checkpoints\",\n", - " \"initial_load_path\": \"/tmp/Meta-Llama-3.1-8B-Instruct/\",\n", - " \"initial_load_in_hf\": True,\n", - " \"last_save_in_hf\": True,\n", - " \"interval\": 400,\n", - " \"async_mode\": \"disabled\"\n", - " },\n", - " \"activation_checkpoint\": {\n", - " \"mode\": \"selective\", # Saves memory\n", - " \"selective_ac_option\": \"op\"\n", - " }\n", - "})\n", - "\n", - "print(\"Memory-Efficient Configuration:\")\n", - "print(OmegaConf.to_yaml(memory_efficient_config))\n", - "\n", - "# To use: await run_actor(TrainerActor, memory_efficient_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "# Tips & Tricks\n", - "\n", - "## Memory Optimization\n", - "- โฌ‡๏ธ Reduce `seq_len` if running out of memory\n", - "- โฌ‡๏ธ Reduce `local_batch_size` if running out of memory\n", - "- โœ… Enable `activation_checkpoint` for memory savings\n", - "\n", - "## Training Speed\n", - "- โฌ†๏ธ Increase `local_batch_size` for faster training (if memory allows)\n", - "- ๐Ÿš€ Use multiple GPUs with FSDP (`data_parallel_shard_degree > 1`)\n", - "- โšก Enable `compile: true` for PyTorch compilation (experimental)\n", - "\n", - "## Debugging\n", - "- ๐Ÿงช Start with small `steps` (e.g., 10-100) to test quickly\n", - "- ๐Ÿ” Use single GPU first (`procs: 1`)\n", - "- ๐Ÿ“Š Monitor loss values in logs\n", - "\n", - "## Checkpoint Management\n", - "- ๐Ÿ’พ Set `interval` based on how often you want to save\n", - "- ๐Ÿ“ Ensure `folder` path exists and has enough space\n", - "- ๐Ÿ”„ Use `initial_load_path` to resume from checkpoints" - ] - } - ], - "metadata": { - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/apps/sft_v2/spawn_actor.py b/apps/sft_v2/spawn_actor.py deleted file mode 100644 index eb9695c76..000000000 --- a/apps/sft_v2/spawn_actor.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -SpawnActor - Orchestrates the spawning and lifecycle management of actors. - -This module provides a high-level interface for creating, setting up, running, -and cleaning up different types of actors (e.g., Trainer, Evaluator, etc.) -""" - -import logging -from typing import Any, Type - -from forge.apps.sft_v2.actor import BaseForgeActor -from omegaconf import DictConfig - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class SpawnActor: - """ - Orchestrator for spawning and managing actor lifecycles. - - This class handles the creation, setup, execution, and cleanup of actors - in a standardized way. - """ - - def __init__(self, actor_class: Type[BaseForgeActor], config: DictConfig): - """ - Initialize the spawn actor orchestrator. - - Args: - actor_class: The actor class to instantiate (must inherit from BaseForgeActor) - config: Configuration dictionary for the actor - """ - self.actor_class = actor_class - self.config = config - self.actor = None - - if not issubclass(actor_class, BaseForgeActor): - raise TypeError( - f"actor_class must be a subclass of BaseForgeActor, got {actor_class}" - ) - - async def spawn(self) -> Any: - """ - Spawn the actor instance with the given configuration. - - Returns: - The spawned actor instance - """ - logger.info(f"Spawning {self.actor_class.__name__}...") - - process_cfg = self.config.pop("processes", {}) - - self.actor = await self.actor_class.options(**process_cfg).as_actor(self.config) - - logger.info(f"{self.actor_class.__name__} spawned successfully.") - return self.actor - - async def setup(self): - """ - Setup the spawned actor (load data, checkpoints, etc.). - """ - if self.actor is None: - raise RuntimeError( - "Actor must be spawned before setup. Call spawn() first." - ) - - logger.info(f"Setting up {self.actor_class.__name__}...") - await self.actor.setup.call() - logger.info(f"{self.actor_class.__name__} setup complete.") - - async def run(self): - """ - Run the main execution logic of the actor. - """ - if self.actor is None: - raise RuntimeError( - "Actor must be spawned before running. Call spawn() first." - ) - - logger.info(f"Running {self.actor_class.__name__}...") - await self.actor.run.call() - logger.info(f"{self.actor_class.__name__} execution complete.") - - async def cleanup(self): - """ - Cleanup the actor resources and stop the mesh. - """ - if self.actor is None: - raise RuntimeError( - "Actor must be spawned before cleanup. Call spawn() first." - ) - - logger.info(f"Cleaning up {self.actor_class.__name__}...") - await self.actor.cleanup.call() - - if hasattr(self.actor, "mesh"): - await self.actor.mesh.stop() - - logger.info(f"{self.actor_class.__name__} cleanup complete.") - - async def run_full_lifecycle(self): - """ - Execute the complete actor lifecycle: spawn -> setup -> run -> cleanup. - - This is a convenience method that runs all phases in sequence. - """ - logger.info(f"Starting full lifecycle for {self.actor_class.__name__}...") - - try: - await self.spawn() - await self.setup() - await self.run() - finally: - if self.actor is not None: - await self.cleanup() - - logger.info(f"Full lifecycle complete for {self.actor_class.__name__}.") - - -async def run_actor( - actor_class: Type[BaseForgeActor], - config: DictConfig, -) -> None: - """ - Convenience function to run an actor with full lifecycle management. 
- - Args: - actor_class: The actor class to instantiate - config: Configuration dictionary for the actor - """ - spawner = SpawnActor(actor_class, config) - await spawner.run_full_lifecycle() diff --git a/apps/sft_v2/trainer_actor.py b/apps/sft_v2/trainer_actor.py deleted file mode 100644 index 10c5e9b38..000000000 --- a/apps/sft_v2/trainer_actor.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Trainer actor implementation for SFT training. - -This is a concrete implementation of BaseForgeActor for supervised fine-tuning. -""" - -import logging - -import torch -import torchtitan.experiments.forge.train_spec as forge_train_spec -from forge.apps.sft_v2.actor import BaseForgeActor -from forge.apps.sft_v2.utils import ( - create_context_parallel_context, - log_training_step, - move_batch_to_device, - setup_sft_dataloader, - setup_tokenizer, -) -from monarch.actor import endpoint -from omegaconf import DictConfig - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class TrainerActor(BaseForgeActor): - """ - Concrete trainer actor for supervised fine-tuning. - - Handles training loop, forward/backward passes, and checkpoint management. - """ - - train_spec: forge_train_spec.ForgeTrainSpec - train_dataloader: any - num_training_steps: int - - def __init__(self, config: DictConfig): - """ - Initialize the trainer actor. - - Args: - config: Configuration dictionary containing training settings - """ - super().__init__(config) - self.num_training_steps = self.job_config.training.steps - - @endpoint - async def setup(self): - """ - Setup the trainer (load data, checkpoint, etc.). - """ - logger.info("Setting up trainer actor...") - - self.tokenizer = setup_tokenizer( - hf_assets_path=self.job_config.model.hf_assets_path - ) - - self.train_dataloader = setup_sft_dataloader( - tokenizer=self.tokenizer, - dataset_path="yahma/alpaca-cleaned", - dataset_split="train", - target_tokens_per_pack=self.job_config.training.seq_len, - batch_size=self.job_config.training.local_batch_size, - device=self.device, - ) - - if self.checkpointer: - logger.info("Loading checkpoint...") - self.checkpointer.load(step=self.current_step) - - logger.info("Trainer setup complete.") - - def forward_backward( - self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor - ) -> torch.Tensor: - """ - Perform forward and backward pass. 
- - Args: - input_dict: Dictionary containing input tokens - labels: Ground truth labels - - Returns: - Computed loss value - """ - model_parts = self.model_parts - parallel_dims = self.parallel_dims - inputs = input_dict["tokens"] - - optional_context_parallel_ctx = create_context_parallel_context( - parallel_dims=parallel_dims, - inputs=inputs, - labels=labels, - model_parts=model_parts, - rotate_method=self.job_config.parallelism.context_parallel_rotate_method, - ) - - if parallel_dims.pp_enabled: - with self.train_context(optional_context_parallel_ctx): - targets, losses = ( - (labels, []) if self.pp_has_last_stage else (None, None) - ) - if self.pp_has_first_stage: - self.pp_schedule.step( - inputs, target=targets, losses=losses, input_batch=inputs - ) - else: - self.pp_schedule.step( - target=targets, losses=losses, input_batch=inputs - ) - - loss = ( - torch.mean(torch.stack(losses)).to(self.device) - if self.pp_has_last_stage - else torch.tensor([-1.0], device=self.device) - ) - else: - with self.train_context(optional_context_parallel_ctx): - assert len(model_parts) == 1 - with self.maybe_enable_amp: - pred = model_parts[0](inputs) - loss = self.loss_fn(pred, labels) - del pred - loss.backward() - - return loss - - def train_step(self, batch: dict[str, torch.Tensor]) -> None: - """ - Execute a single training step. - - Args: - batch: Dictionary containing batch data (tokens, labels, etc.) - """ - labels = batch.pop("labels") - loss = self.forward_backward(batch, labels) - - log_training_step(self.current_step, self.num_training_steps, loss, logger) - - self.optimizers.step() - self.lr_schedulers.step() - - @endpoint - async def run(self) -> None: - """ - Main training loop. - """ - logger.info("Starting training loop...") - - dataloader = iter(self.train_dataloader) - self.optimizers.zero_grad() - - while self.current_step < self.num_training_steps: - batch = next(dataloader) - batch = move_batch_to_device(batch, self.device) - - self.train_step(batch) - self.current_step += 1 - - if self.checkpointer: - self.checkpointer.save( - curr_step=self.current_step, - last_step=self.current_step == self.num_training_steps, - ) - - logger.info("Training complete!") - - @endpoint - async def cleanup(self) -> None: - """ - Cleanup resources (close checkpointer, logger, etc.). - """ - logger.info("Cleaning up trainer actor...") - - if self.checkpointer: - self.checkpointer.close() - if self.metric_logger: - self.metric_logger.close() - - logger.info("Cleanup complete.") - - def __repr__(self) -> str: - return "TrainerActor" diff --git a/apps/sft_v2/utils.py b/apps/sft_v2/utils.py deleted file mode 100644 index 6d0219805..000000000 --- a/apps/sft_v2/utils.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Utility functions for SFT training actors. - -These utilities handle data loading, model setup, and common operations. 
-""" - -import logging -import os -from functools import partial -from typing import Any, Optional - -import torch -from forge.data.collate import collate_packed -from forge.data.datasets.packed import PackedDataset, TextPacker -from forge.data.datasets.sft_dataset import AlpacaToMessages, sft_iterable_dataset -from forge.data.tokenizer import HuggingFaceModelTokenizer -from torchdata.stateful_dataloader import StatefulDataLoader -from torchtitan.distributed import ParallelDims, utils as dist_utils - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -def setup_tokenizer( - hf_assets_path: str, - tokenizer_filename: str = "tokenizer.json", - tokenizer_config_filename: str = "tokenizer_config.json", - generation_config_filename: str = "generation_config.json", -) -> HuggingFaceModelTokenizer: - """ - Setup HuggingFace tokenizer from model assets. - - Args: - hf_assets_path: Path to the directory containing tokenizer files - tokenizer_filename: Name of the tokenizer JSON file - tokenizer_config_filename: Name of the tokenizer config JSON file - generation_config_filename: Name of the generation config JSON file - - Returns: - Initialized HuggingFaceModelTokenizer - """ - tokenizer_json_path = os.path.join(hf_assets_path, tokenizer_filename) - tokenizer_config_path = os.path.join(hf_assets_path, tokenizer_config_filename) - generation_config_path = os.path.join(hf_assets_path, generation_config_filename) - - logger.info(f"Loading tokenizer from: {tokenizer_json_path}") - - tokenizer = HuggingFaceModelTokenizer( - tokenizer_json_path=tokenizer_json_path, - tokenizer_config_json_path=tokenizer_config_path, - generation_config_path=generation_config_path, - ) - - return tokenizer - - -def setup_sft_dataloader( - tokenizer: HuggingFaceModelTokenizer, - dataset_path: str, - dataset_split: str, - target_tokens_per_pack: int, - batch_size: int, - device: torch.device, - padding_idx: int = 0, - message_transform: Optional[Any] = None, -) -> StatefulDataLoader: - """ - Setup dataloader for SFT training. 
- - Args: - tokenizer: Tokenizer to use for processing text - dataset_path: Path or name of the dataset (e.g., "yahma/alpaca-cleaned") - dataset_split: Dataset split to use (e.g., "train", "validation") - target_tokens_per_pack: Target sequence length for packing - batch_size: Batch size for training - device: Device to move tensors to - padding_idx: Padding token index - message_transform: Transform to convert dataset format to messages - - Returns: - Configured StatefulDataLoader - """ - if message_transform is None: - message_transform = AlpacaToMessages() - - logger.info(f"Loading SFT dataset from: {dataset_path}, split: {dataset_split}") - - dataset = sft_iterable_dataset( - model_transform=tokenizer, - message_transform=message_transform, - path=dataset_path, - split=dataset_split, - ) - - packer = TextPacker(padding_idx=padding_idx) - dataset = PackedDataset( - dataset=dataset, - packer=packer, - target_tokens_per_pack=target_tokens_per_pack, - ) - - dataloader = StatefulDataLoader( - dataset=dataset, - batch_size=batch_size, - collate_fn=partial( - collate_packed, mask_fn=packer.create_block_mask, device=device - ), - ) - - logger.info( - f"Created dataloader with batch_size={batch_size}, target_tokens={target_tokens_per_pack}" - ) - - return dataloader - - -def create_context_parallel_context( - parallel_dims: ParallelDims, - inputs: torch.Tensor, - labels: torch.Tensor, - model_parts: list, - rotate_method: str, -): - """ - Create context parallel context for distributed training. - - Args: - parallel_dims: Parallel dimensions configuration - inputs: Input tensor - labels: Label tensor - model_parts: List of model parts - rotate_method: Context parallel rotation method - - Returns: - Context parallel context or None if CP is not enabled - """ - if not parallel_dims.cp_enabled: - return None - - return dist_utils.create_context_parallel_ctx( - cp_mesh=parallel_dims.world_mesh["cp"], - cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], - cp_seq_dims=[1, 1] + [0 for _ in model_parts], - cp_no_restore_buffers={inputs, labels}, - cp_rotate_method=rotate_method, - ) - - -def move_batch_to_device(batch: dict[str, Any], device: torch.device) -> dict[str, Any]: - """ - Move batch tensors to the specified device. - - Args: - batch: Dictionary containing batch data - device: Target device - - Returns: - Batch with tensors moved to device - """ - for key, value in batch.items(): - if isinstance(value, torch.Tensor): - batch[key] = value.to(device) - return batch - - -def log_training_step( - step: int, - total_steps: int, - loss: torch.Tensor, - logger: logging.Logger, -): - """ - Log training step information. 
- - Args: - step: Current training step - total_steps: Total number of training steps - loss: Current loss value - logger: Logger instance - """ - logger.info(f"Step {step}/{total_steps} | Loss: {loss.item():.4f}") From 53371c63fd61b7cd20989e13afb985bc345c8aae Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Thu, 16 Oct 2025 14:40:50 -0700 Subject: [PATCH 4/7] Implement Epoch-Based Evaluation with Non-Blocking All-Reduce --- apps/sft/main.py | 122 +++++++++-- apps/sft/test_evaluate.py | 437 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 542 insertions(+), 17 deletions(-) create mode 100644 apps/sft/test_evaluate.py diff --git a/apps/sft/main.py b/apps/sft/main.py index 97ed4125e..7d6cfc665 100644 --- a/apps/sft/main.py +++ b/apps/sft/main.py @@ -279,8 +279,29 @@ def train_step(self, batch) -> None: self.optimizers.step() self.lr_schedulers.step() + def _extract_epoch_from_batch(self, batch: dict) -> int | None: + """Extract epoch number from batch metrics.""" + if "metrics" not in batch: + return None + + for metric in batch["metrics"]: + if hasattr(metric, "metric_name") and metric.metric_name == "num_epochs": + return metric.value + return None + async def evaluate(self) -> dict[str, float]: - """Run evaluation on validation set (internal method, not an endpoint).""" + """Run evaluation on validation set for one complete epoch. + + Uses prefetch + non-blocking all_reduce pattern to detect epoch completion + across all ranks without blocking on every batch. + + Pattern: + - Iteration N: Start async all_reduce on next batch's epoch (non-blocking) + - Process current batch while all_reduce completes in background + - Iteration N+1: Check result from previous all_reduce (should be done) + + This overlaps communication with computation for better performance. 
+ """ logger.info("=" * 50) logger.info("STARTING EVALUATION ") logger.info("=" * 50) @@ -292,30 +313,97 @@ async def evaluate(self) -> dict[str, float]: val_dataloader = iter(self.val_dataloader) total_loss = 0.0 num_batches = 0 + starting_epoch = None + + # Prefetch first batch + try: + next_batch = next(val_dataloader) + except StopIteration: + logger.warning("Validation dataloader is empty") + return {"val_loss": 0.0, "val_batches": 0} + + next_should_break = False + pending_work = None # Handle for async all_reduce + epoch_tensor = None # Tensor for all_reduce result with torch.no_grad(): - for step in range(self.eval_steps): - try: - batch = next(val_dataloader) + while True: + # Check result from PREVIOUS iteration's async all_reduce + if pending_work is not None: + pending_work.wait() # Should be complete (or very fast) since we did compute + if epoch_tensor is not None: + next_should_break = epoch_tensor.item() > 0 + pending_work = None + + # Check if we should break (based on previous iteration's check) + if next_should_break: + logger.info( + "Epoch completed across all ranks - stopping evaluation" + ) + break - # Move tensors to device - for k, v in batch.items(): - if isinstance(v, torch.Tensor): - batch[k] = v.to(self.device) + # Check optional cap on eval steps + if self.eval_steps > 0 and num_batches >= self.eval_steps: + logger.info(f"Reached eval_steps cap of {self.eval_steps}") + break - labels = batch.pop("labels") - loss = self.forward_only(batch, labels) + # Use the batch that was prefetched in previous iteration + batch = next_batch - total_loss += loss.item() - num_batches += 1 + # Extract epoch from current batch + current_epoch = self._extract_epoch_from_batch(batch) + if current_epoch is not None and starting_epoch is None: + starting_epoch = current_epoch + logger.info(f"Starting evaluation at epoch {starting_epoch}") - logger.info( - f" Eval batch {num_batches}/{self.eval_steps} | Loss: {loss.item():.4f}" - ) + # Prefetch next batch and start async all_reduce + try: + next_batch = next(val_dataloader) + + # Extract epoch from next batch + next_epoch = self._extract_epoch_from_batch(next_batch) + + # Start NON-BLOCKING all_reduce to check if any rank completed epoch + if next_epoch is not None and starting_epoch is not None: + # Check if next batch indicates epoch completion + epoch_increment = next_epoch - starting_epoch + + if torch.distributed.is_initialized(): + # Create tensor for all_reduce + epoch_tensor = torch.tensor( + [epoch_increment], dtype=torch.long, device=self.device + ) + # Start async all_reduce (returns immediately, doesn't block) + pending_work = torch.distributed.all_reduce( + epoch_tensor, + op=torch.distributed.ReduceOp.MAX, + async_op=True, # NON-BLOCKING - returns immediately + ) + else: + # Single rank case - just check locally + next_should_break = epoch_increment > 0 except StopIteration: - logger.warning("Reached end of validation dataloader early") - break + # No more batches - this is the last one + next_should_break = True + + # Process current batch (while all_reduce completes in background) + # Move tensors to device + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + batch[k] = v.to(self.device) + + labels = batch.pop("labels") + loss = self.forward_only(batch, labels) + # GPU compute happens here while network does all_reduce + + total_loss += loss.item() + num_batches += 1 + + eval_steps_info = f"/{self.eval_steps}" if self.eval_steps > 0 else "" + logger.info( + f" Eval batch {num_batches}{eval_steps_info} | 
Loss: {loss.item():.4f}" + ) # Set model back to train mode for model_part in self.model_parts: diff --git a/apps/sft/test_evaluate.py b/apps/sft/test_evaluate.py new file mode 100644 index 000000000..57959b09d --- /dev/null +++ b/apps/sft/test_evaluate.py @@ -0,0 +1,437 @@ +""" +Tests for the non-blocking all_reduce evaluation logic in main.py + +This tests the epoch-detection and async all_reduce pattern used to +synchronize evaluation completion across multiple ranks without blocking. +""" + +from dataclasses import dataclass +from unittest.mock import MagicMock, Mock, patch + +import pytest +import torch + + +@dataclass +class MockMetric: + """Mock metric object matching the structure in batch["metrics"]""" + + metric_name: str + value: int + + +class MockTrainer: + """Mock trainer with minimal setup for testing evaluate logic""" + + def __init__(self, eval_steps=0): + self.eval_steps = eval_steps + self.device = torch.device("cpu") + self.model_parts = [Mock()] + + def _extract_epoch_from_batch(self, batch: dict) -> int | None: + """Extract epoch number from batch metrics.""" + if "metrics" not in batch: + return None + + for metric in batch["metrics"]: + if hasattr(metric, "metric_name") and metric.metric_name == "num_epochs": + return metric.value + return None + + def forward_only(self, batch, labels): + """Mock forward pass - returns dummy loss""" + return torch.tensor(1.5) + + +def create_batch_with_epoch(epoch: int, loss_value: float = 1.5): + """Helper to create a mock batch with epoch metadata""" + return { + "input_ids": torch.randn(2, 10), + "attention_mask": torch.ones(2, 10), + "labels": torch.randint(0, 100, (2, 10)), + "metrics": [MockMetric(metric_name="num_epochs", value=epoch)], + } + + +def create_batch_without_epoch(loss_value: float = 1.5): + """Helper to create a mock batch without epoch metadata""" + return { + "input_ids": torch.randn(2, 10), + "attention_mask": torch.ones(2, 10), + "labels": torch.randint(0, 100, (2, 10)), + } + + +class TestExtractEpochFromBatch: + """Test the _extract_epoch_from_batch helper method""" + + def test_extract_epoch_success(self): + """Test extracting epoch from batch with proper metadata""" + trainer = MockTrainer() + batch = create_batch_with_epoch(epoch=5) + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch == 5 + + def test_extract_epoch_no_metrics(self): + """Test batch without metrics returns None""" + trainer = MockTrainer() + batch = create_batch_without_epoch() + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch is None + + def test_extract_epoch_wrong_metric_name(self): + """Test batch with metrics but wrong metric_name returns None""" + trainer = MockTrainer() + batch = { + "input_ids": torch.randn(2, 10), + "metrics": [MockMetric(metric_name="other_metric", value=10)], + } + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch is None + + def test_extract_epoch_multiple_metrics(self): + """Test extracting epoch from batch with multiple metrics""" + trainer = MockTrainer() + batch = { + "input_ids": torch.randn(2, 10), + "metrics": [ + MockMetric(metric_name="loss", value=1.5), + MockMetric(metric_name="num_epochs", value=3), + MockMetric(metric_name="step", value=100), + ], + } + + epoch = trainer._extract_epoch_from_batch(batch) + assert epoch == 3 + + +class TestEvaluationLogic: + """Test the evaluation loop logic (single-rank scenario)""" + + @pytest.mark.asyncio + async def test_single_epoch_completion(self): + """Test that evaluation stops after one complete epoch""" + 
trainer = MockTrainer(eval_steps=0) # No cap + + # Create batches: 3 from epoch 0, then epoch increments to 1 + batches = [ + create_batch_with_epoch(0), + create_batch_with_epoch(0), + create_batch_with_epoch(0), + create_batch_with_epoch(1), # Epoch increment - should trigger stop + ] + + dataloader = iter(batches) + + # Simulate the evaluation pattern + num_processed = 0 + starting_epoch = None + next_should_break = False + + # Get first batch + next_batch = next(dataloader) + + while True: + if next_should_break: + break + + batch = next_batch + + # Extract epoch from current batch + current_epoch = trainer._extract_epoch_from_batch(batch) + if current_epoch is not None and starting_epoch is None: + starting_epoch = current_epoch + + # Try to prefetch next batch + try: + next_batch = next(dataloader) + next_epoch = trainer._extract_epoch_from_batch(next_batch) + + # Check for epoch increment + if next_epoch is not None and starting_epoch is not None: + epoch_increment = next_epoch - starting_epoch + next_should_break = epoch_increment > 0 + + except StopIteration: + next_should_break = True + + # Process current batch + num_processed += 1 + + # Should have processed 3 batches (stopped when detected epoch 1) + assert num_processed == 3 + assert starting_epoch == 0 + + @pytest.mark.asyncio + async def test_eval_steps_cap(self): + """Test that evaluation respects eval_steps cap""" + trainer = MockTrainer(eval_steps=2) # Cap at 2 batches + + # Create 5 batches all in same epoch + batches = [create_batch_with_epoch(0) for _ in range(5)] + dataloader = iter(batches) + + # Simulate the evaluation pattern + num_processed = 0 + next_should_break = False + + # Get first batch + next_batch = next(dataloader) + + while True: + if next_should_break: + break + + # Check eval_steps cap + if trainer.eval_steps > 0 and num_processed >= trainer.eval_steps: + break + + batch = next_batch + + # Try to prefetch next batch + try: + next_batch = next(dataloader) + except StopIteration: + next_should_break = True + + # Process current batch + num_processed += 1 + + # Should have processed exactly 2 batches (eval_steps cap) + assert num_processed == 2 + + @pytest.mark.asyncio + async def test_empty_dataloader(self): + """Test handling of empty dataloader""" + trainer = MockTrainer(eval_steps=0) + + batches = [] + dataloader = iter(batches) + + # Should raise StopIteration immediately + with pytest.raises(StopIteration): + next_batch = next(dataloader) + + @pytest.mark.asyncio + async def test_single_batch(self): + """Test evaluation with only one batch""" + trainer = MockTrainer(eval_steps=0) + + batches = [create_batch_with_epoch(0)] + dataloader = iter(batches) + + num_processed = 0 + next_should_break = False + + # Get first batch + next_batch = next(dataloader) + + while True: + if next_should_break: + break + + batch = next_batch + + # Try to prefetch next batch + try: + next_batch = next(dataloader) + except StopIteration: + next_should_break = True + + # Process current batch + num_processed += 1 + + # Should have processed 1 batch + assert num_processed == 1 + + @pytest.mark.asyncio + async def test_no_epoch_metadata(self): + """Test evaluation when batches don't have epoch metadata""" + trainer = MockTrainer(eval_steps=3) # Use eval_steps as fallback + + # Create batches without epoch metadata + batches = [create_batch_without_epoch() for _ in range(5)] + dataloader = iter(batches) + + num_processed = 0 + next_should_break = False + next_batch = next(dataloader) + + while True: + if next_should_break: 
+ break + + # Check eval_steps cap (should be the stopping condition) + if trainer.eval_steps > 0 and num_processed >= trainer.eval_steps: + break + + batch = next_batch + + try: + next_batch = next(dataloader) + except StopIteration: + next_should_break = True + + num_processed += 1 + + # Should stop at eval_steps + assert num_processed == 3 + + +class TestAsyncAllReduce: + """Test the async all_reduce pattern with mocked distributed operations""" + + @pytest.mark.asyncio + async def test_async_all_reduce_pattern(self): + """Test the async all_reduce pattern with mock distributed operations""" + + # Mock distributed environment + with patch("torch.distributed.is_initialized", return_value=True): + with patch("torch.distributed.all_reduce") as mock_all_reduce: + + # Create mock Work handle for async operation + mock_work = Mock() + mock_work.wait = Mock() + mock_all_reduce.return_value = mock_work + + trainer = MockTrainer(eval_steps=0) + + # Simulate the async pattern + epoch_tensor = torch.tensor([0], dtype=torch.long) + + # Start async all_reduce (should return immediately) + work_handle = torch.distributed.all_reduce( + epoch_tensor, op=torch.distributed.ReduceOp.MAX, async_op=True + ) + + # Verify it returned immediately with a work handle + assert work_handle is not None + assert mock_all_reduce.called + + # Simulate doing computation here... + + # Wait for completion + work_handle.wait() + assert mock_work.wait.called + + @pytest.mark.asyncio + async def test_multi_rank_epoch_detection(self): + """Test that epoch completion is detected when ANY rank finishes""" + + with patch("torch.distributed.is_initialized", return_value=True): + with patch("torch.distributed.all_reduce") as mock_all_reduce: + + def all_reduce_side_effect(tensor, op, async_op=False): + """Simulate all_reduce MAX operation across ranks + Rank 0: epoch_increment = 0 (still in epoch 0) + Rank 1: epoch_increment = 1 (moved to epoch 1) + MAX = 1, so all ranks should stop + """ + # Simulate MAX operation - set tensor to max value + tensor[0] = 1 # At least one rank has epoch_increment=1 + + if async_op: + mock_work = Mock() + mock_work.wait = Mock() + return mock_work + return None + + mock_all_reduce.side_effect = all_reduce_side_effect + + trainer = MockTrainer(eval_steps=0) + + # Simulate rank 1's perspective: it moved to epoch 1 + starting_epoch = 0 + next_epoch = 1 + epoch_increment = next_epoch - starting_epoch # = 1 + + epoch_tensor = torch.tensor([epoch_increment], dtype=torch.long) + + # Start async all_reduce + work = torch.distributed.all_reduce( + epoch_tensor, op=torch.distributed.ReduceOp.MAX, async_op=True + ) + + # Wait for result + work.wait() + + # Check if should break (any rank has increment > 0) + should_break = epoch_tensor.item() > 0 + + assert should_break is True + assert epoch_tensor.item() == 1 + + +class TestEvaluationIntegration: + """Integration-style tests for the full evaluation flow""" + + @pytest.mark.asyncio + async def test_prefetch_pattern_ordering(self): + """Test that the prefetch pattern processes batches in correct order""" + trainer = MockTrainer(eval_steps=0) + + # Create identifiable batches + batches = [ + { + "id": 0, + "metrics": [MockMetric("num_epochs", 0)], + "labels": torch.zeros(1), + }, + { + "id": 1, + "metrics": [MockMetric("num_epochs", 0)], + "labels": torch.zeros(1), + }, + { + "id": 2, + "metrics": [MockMetric("num_epochs", 0)], + "labels": torch.zeros(1), + }, + { + "id": 3, + "metrics": [MockMetric("num_epochs", 1)], + "labels": torch.zeros(1), + }, + ] + + 
dataloader = iter(batches) + processed_ids = [] + + # Prefetch first batch + next_batch = next(dataloader) + next_should_break = False + starting_epoch = None + + while True: + if next_should_break: + break + + # Process current batch + batch = next_batch + processed_ids.append(batch["id"]) + + # Extract epoch + current_epoch = trainer._extract_epoch_from_batch(batch) + if current_epoch is not None and starting_epoch is None: + starting_epoch = current_epoch + + # Prefetch next + try: + next_batch = next(dataloader) + next_epoch = trainer._extract_epoch_from_batch(next_batch) + + if next_epoch is not None and starting_epoch is not None: + epoch_increment = next_epoch - starting_epoch + next_should_break = epoch_increment > 0 + except StopIteration: + next_should_break = True + + # Should have processed batches 0, 1, 2 (stopped when detected batch 3 has epoch 1) + assert processed_ids == [0, 1, 2] + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 4793948f15fad070cde9202e75b955f5f64cdb4a Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Fri, 17 Oct 2025 14:08:49 -0700 Subject: [PATCH 5/7] Add configurable datasets and validation and shortening the code --- apps/sft/llama3_8b.yaml | 18 ++++- apps/sft/main.py | 147 ++++++++++++++++++---------------------- apps/sft/qwen3_8b.yaml | 15 +++- 3 files changed, 94 insertions(+), 86 deletions(-) diff --git a/apps/sft/llama3_8b.yaml b/apps/sft/llama3_8b.yaml index 2fd563a6c..f24936670 100644 --- a/apps/sft/llama3_8b.yaml +++ b/apps/sft/llama3_8b.yaml @@ -26,15 +26,27 @@ optimizer: lr_scheduler: warmup_steps: 200 +dataset: + path: "yahma/alpaca-cleaned" + split: "train[:95%]" + +dataset_val: + path: "yahma/alpaca-cleaned" + split: "train[95%:]" + training: local_batch_size: 1 seq_len: 2048 max_norm: 1.0 steps: 1000 compile: false - dataset: "c4" - #eval_interval: 500 # Setting eval_interval to run evaluation - #eval_steps: 100 # Number of validation batches during each evaluation run + + +validation: + enabled: true # Enable/disable validation + eval_interval: 100 # Run evaluation every 100 training steps + eval_steps: 50 # Number of batches per evaluation (0 = full epoch) + parallelism: data_parallel_replicate_degree: 1 diff --git a/apps/sft/main.py b/apps/sft/main.py index 7d6cfc665..c694867fb 100644 --- a/apps/sft/main.py +++ b/apps/sft/main.py @@ -79,9 +79,25 @@ def __init__(self, config: DictConfig): self._rank = current_rank().rank self._size = math.prod(current_size().values()) - # Evaluation settings - self.eval_interval = job_config.training.get("eval_interval", float("inf")) - self.eval_steps = job_config.training.get("eval_steps", 0) + # Evaluation settings from validation config + validation_config = job_config.get("validation", {}) + self.validation_enabled = validation_config.get("enabled", False) + + if self.validation_enabled: + self.eval_interval = validation_config.get("eval_interval") + self.eval_steps = validation_config.get("eval_steps") + + if self.eval_interval is None: + raise ValueError( + "validation.eval_interval is required when validation.enabled is true" + ) + if self.eval_steps is None: + raise ValueError( + "validation.eval_steps is required when validation.enabled is true" + ) + else: + self.eval_interval = None + self.eval_steps = None self._init_dist() super().__init__(job_config) @@ -113,23 +129,30 @@ def _init_dist(self): @endpoint async def setup(self): - # Setup training data (first 90% of train split) + # Setup training data from config + dataset_config = 
self.job_config.get("dataset") + self.train_dataloader = self.setup_data( - dataset_path="yahma/alpaca-cleaned", dataset_split="train[:90%]" + dataset_path=dataset_config.get("path"), + dataset_split=dataset_config.get("split"), ) - # Setup validation data (last 10% of train split) + # Setup validation data from config + dataset_val_config = self.job_config.get("dataset_val", {}) self.val_dataloader = self.setup_data( - dataset_path="yahma/alpaca-cleaned", dataset_split="train[90%:]" + dataset_path=dataset_val_config.get("path", dataset_config.get("path")), + dataset_split=dataset_val_config.get("split", dataset_config.get("split")), ) # Load checkpoint if resuming self.checkpointer.load(step=self.current_step) - def setup_data( - self, dataset_path: str = "yahma/alpaca-cleaned", dataset_split: str = "train" - ): + def setup_data(self, dataset_path: str, dataset_split: str): """Setup data with configurable dataset path and split.""" + if not dataset_path or not dataset_split: + raise ValueError( + f"dataset.path and dataset.split are required in YAML config. Got path={dataset_path}, split={dataset_split}" + ) print(os.path.join(self.job_config.model.hf_assets_path, "tokenizer.json")) tokenizer = HuggingFaceModelTokenizer( tokenizer_json_path=os.path.join( @@ -281,39 +304,26 @@ def train_step(self, batch) -> None: def _extract_epoch_from_batch(self, batch: dict) -> int | None: """Extract epoch number from batch metrics.""" - if "metrics" not in batch: - return None - - for metric in batch["metrics"]: - if hasattr(metric, "metric_name") and metric.metric_name == "num_epochs": - return metric.value + if "metrics" in batch: + for metric in batch["metrics"]: + if ( + hasattr(metric, "metric_name") + and metric.metric_name == "num_epochs" + ): + return metric.value return None async def evaluate(self) -> dict[str, float]: - """Run evaluation on validation set for one complete epoch. - - Uses prefetch + non-blocking all_reduce pattern to detect epoch completion - across all ranks without blocking on every batch. - - Pattern: - - Iteration N: Start async all_reduce on next batch's epoch (non-blocking) - - Process current batch while all_reduce completes in background - - Iteration N+1: Check result from previous all_reduce (should be done) - - This overlaps communication with computation for better performance. 
- """ + """Run evaluation with async all_reduce for cross-rank epoch synchronization.""" logger.info("=" * 50) - logger.info("STARTING EVALUATION ") + logger.info("STARTING EVALUATION") logger.info("=" * 50) - # Set model to eval mode for model_part in self.model_parts: model_part.eval() val_dataloader = iter(self.val_dataloader) - total_loss = 0.0 - num_batches = 0 - starting_epoch = None + total_loss, num_batches, starting_epoch = 0.0, 0, None # Prefetch first batch try: @@ -322,106 +332,79 @@ async def evaluate(self) -> dict[str, float]: logger.warning("Validation dataloader is empty") return {"val_loss": 0.0, "val_batches": 0} - next_should_break = False - pending_work = None # Handle for async all_reduce - epoch_tensor = None # Tensor for all_reduce result + should_break, pending_work, epoch_tensor = False, None, None with torch.no_grad(): while True: - # Check result from PREVIOUS iteration's async all_reduce + # Wait for previous async all_reduce to complete if pending_work is not None: - pending_work.wait() # Should be complete (or very fast) since we did compute - if epoch_tensor is not None: - next_should_break = epoch_tensor.item() > 0 + pending_work.wait() + should_break = ( + epoch_tensor.item() > 0 if epoch_tensor is not None else False + ) pending_work = None - # Check if we should break (based on previous iteration's check) - if next_should_break: + if should_break: logger.info( "Epoch completed across all ranks - stopping evaluation" ) break - # Check optional cap on eval steps if self.eval_steps > 0 and num_batches >= self.eval_steps: logger.info(f"Reached eval_steps cap of {self.eval_steps}") break - # Use the batch that was prefetched in previous iteration batch = next_batch - # Extract epoch from current batch + # Track starting epoch current_epoch = self._extract_epoch_from_batch(batch) if current_epoch is not None and starting_epoch is None: starting_epoch = current_epoch - logger.info(f"Starting evaluation at epoch {starting_epoch}") - # Prefetch next batch and start async all_reduce + # Prefetch next batch and start async epoch check try: next_batch = next(val_dataloader) - - # Extract epoch from next batch next_epoch = self._extract_epoch_from_batch(next_batch) - # Start NON-BLOCKING all_reduce to check if any rank completed epoch if next_epoch is not None and starting_epoch is not None: - # Check if next batch indicates epoch completion epoch_increment = next_epoch - starting_epoch - if torch.distributed.is_initialized(): - # Create tensor for all_reduce epoch_tensor = torch.tensor( [epoch_increment], dtype=torch.long, device=self.device ) - # Start async all_reduce (returns immediately, doesn't block) pending_work = torch.distributed.all_reduce( epoch_tensor, op=torch.distributed.ReduceOp.MAX, - async_op=True, # NON-BLOCKING - returns immediately + async_op=True, ) else: - # Single rank case - just check locally - next_should_break = epoch_increment > 0 - + should_break = epoch_increment > 0 except StopIteration: - # No more batches - this is the last one - next_should_break = True + should_break = True - # Process current batch (while all_reduce completes in background) - # Move tensors to device + # Process current batch (overlaps with async all_reduce) for k, v in batch.items(): if isinstance(v, torch.Tensor): batch[k] = v.to(self.device) labels = batch.pop("labels") loss = self.forward_only(batch, labels) - # GPU compute happens here while network does all_reduce - total_loss += loss.item() num_batches += 1 - eval_steps_info = f"/{self.eval_steps}" if 
self.eval_steps > 0 else "" - logger.info( - f" Eval batch {num_batches}{eval_steps_info} | Loss: {loss.item():.4f}" - ) + if num_batches % 10 == 0: + logger.info(f" Eval batch {num_batches} | Loss: {loss.item():.4f}") - # Set model back to train mode for model_part in self.model_parts: model_part.train() avg_loss = total_loss / max(num_batches, 1) - - metrics = { - "val_loss": avg_loss, - "val_batches": num_batches, - } - - logger.info("-" * 50) - logger.info(f"EVALUATION COMPLETE") - logger.info(f"Validation Loss: {avg_loss:.4f}") - logger.info(f"Batches Evaluated: {num_batches}") + logger.info( + f"EVALUATION COMPLETE | Val Loss: {avg_loss:.4f} | Batches: {num_batches}" + ) logger.info("=" * 50) - return metrics + + return {"val_loss": avg_loss, "val_batches": num_batches} @endpoint async def train(self) -> None: @@ -439,8 +422,8 @@ async def train(self) -> None: self.train_step(batch) self.current_step += 1 - # Run evaluation periodically - if self.current_step % self.eval_interval == 0: + # Run evaluation periodically if enabled + if self.validation_enabled and self.current_step % self.eval_interval == 0: eval_metrics = await self.evaluate() logger.info(f"Step {self.current_step} | Eval metrics: {eval_metrics}") diff --git a/apps/sft/qwen3_8b.yaml b/apps/sft/qwen3_8b.yaml index 2ab88bbd3..2d4128065 100644 --- a/apps/sft/qwen3_8b.yaml +++ b/apps/sft/qwen3_8b.yaml @@ -25,13 +25,26 @@ optimizer: lr_scheduler: warmup_steps: 200 +# Dataset configuration +dataset: + path: "yahma/alpaca-cleaned" + split: "train[:95%]" + +dataset_val: + path: "yahma/alpaca-cleaned" + split: "train[95%:]" + training: local_batch_size: 1 seq_len: 2048 max_norm: 1.0 steps: 1000 compile: false - dataset: "c4" + +validation: + enabled: true # Enable/disable validation + eval_interval: 100 # Run evaluation every 100 training steps + eval_steps: 50 # Number of batches per evaluation (0 = full epoch) parallelism: data_parallel_replicate_degree: 1 From 676db88bc2e51d5b1d9355bf44c0e74ccba79dd5 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Fri, 17 Oct 2025 14:09:05 -0700 Subject: [PATCH 6/7] Add configurable datasets and validation and shortening the code --- apps/sft/llama3_8b_test_eval.yaml | 65 +++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 apps/sft/llama3_8b_test_eval.yaml diff --git a/apps/sft/llama3_8b_test_eval.yaml b/apps/sft/llama3_8b_test_eval.yaml new file mode 100644 index 000000000..65abf164f --- /dev/null +++ b/apps/sft/llama3_8b_test_eval.yaml @@ -0,0 +1,65 @@ +# Test configuration to verify evaluation is working +# Runs very few steps with frequent evaluation + +comm: + trace_buf_size: 0 + +model: + name: llama3 + flavor: 8B + hf_assets_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct + +processes: + procs: 8 # Just 2 processes for faster testing + with_gpus: true + +optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + + +lr_scheduler: + warmup_steps: 2 + +dataset: + path: "yahma/alpaca-cleaned" + split: "train[:95%]" + +dataset_val: + path: "yahma/alpaca-cleaned" + split: "train[95%:]" + +training: + local_batch_size: 4 + seq_len: 512 # Shorter sequences for speed + max_norm: 1.0 + steps: 100 # Only 10 training steps total + compile: false + +validation: + enabled: true # Enable/disable validation + eval_interval: 100 # Run evaluation every 100 training steps + eval_steps: 50 # Number of batches per evaluation (0 = full epoch) + +parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + tensor_parallel_degree: 2 + 
pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: false + +checkpoint: + enable: true + folder: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/test_eval_checkpoints + initial_load_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/ + initial_load_in_hf: true + last_save_in_hf: true + interval: 100 # Don't save frequently during test + async_mode: disabled + +activation_checkpoint: + mode: selective + selective_ac_option: op From 250c0cd28276edc27b3d6ee228dda1f80f4d1ca8 Mon Sep 17 00:00:00 2001 From: Hossein Kavianihamedani Date: Fri, 17 Oct 2025 14:23:00 -0700 Subject: [PATCH 7/7] Removed llama test eval --- apps/sft/llama3_8b_test_eval.yaml | 65 ------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 apps/sft/llama3_8b_test_eval.yaml diff --git a/apps/sft/llama3_8b_test_eval.yaml b/apps/sft/llama3_8b_test_eval.yaml deleted file mode 100644 index 65abf164f..000000000 --- a/apps/sft/llama3_8b_test_eval.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Test configuration to verify evaluation is working -# Runs very few steps with frequent evaluation - -comm: - trace_buf_size: 0 - -model: - name: llama3 - flavor: 8B - hf_assets_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct - -processes: - procs: 8 # Just 2 processes for faster testing - with_gpus: true - -optimizer: - name: AdamW - lr: 1e-5 - eps: 1e-8 - - -lr_scheduler: - warmup_steps: 2 - -dataset: - path: "yahma/alpaca-cleaned" - split: "train[:95%]" - -dataset_val: - path: "yahma/alpaca-cleaned" - split: "train[95%:]" - -training: - local_batch_size: 4 - seq_len: 512 # Shorter sequences for speed - max_norm: 1.0 - steps: 100 # Only 10 training steps total - compile: false - -validation: - enabled: true # Enable/disable validation - eval_interval: 100 # Run evaluation every 100 training steps - eval_steps: 50 # Number of batches per evaluation (0 = full epoch) - -parallelism: - data_parallel_replicate_degree: 1 - data_parallel_shard_degree: -1 - tensor_parallel_degree: 2 - pipeline_parallel_degree: 1 - context_parallel_degree: 1 - expert_parallel_degree: 1 - disable_loss_parallel: false - -checkpoint: - enable: true - folder: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/test_eval_checkpoints - initial_load_path: /home/hosseinkh/models/Meta-Llama-3.1-8B-Instruct/ - initial_load_in_hf: true - last_save_in_hf: true - interval: 100 # Don't save frequently during test - async_mode: disabled - -activation_checkpoint: - mode: selective - selective_ac_option: op
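
The core of the evaluation change in PATCH 4/7 and 5/7 is the prefetch + non-blocking all_reduce loop inside `evaluate()`: each iteration processes the current batch while an asynchronous `MAX` reduction on the next batch's epoch counter runs in the background, and every rank stops once any rank has rolled over to the next epoch. Below is a minimal, self-contained sketch of that pattern. It is not the trainer's actual code: the helper names (`run_one_eval_epoch`, `process_batch`, `get_epoch`) are illustrative, and for brevity it assumes the epoch counter sits directly on the batch dict, whereas the real implementation reads it from `batch["metrics"]` via `_extract_epoch_from_batch()`.

```python
# Sketch of the prefetch + non-blocking all_reduce epoch-sync pattern.
# Assumptions (not the real trainer API): batches are dicts with an optional
# "num_epochs" key, and process_batch() performs the forward pass.
import torch
import torch.distributed as dist


def run_one_eval_epoch(batches, process_batch, device, eval_steps=0):
    """Consume batches until any rank observes the epoch counter increase."""

    def get_epoch(batch):
        # Illustrative: the actual code extracts this from batch["metrics"].
        return batch.get("num_epochs")

    try:
        next_batch = next(batches)
    except StopIteration:
        return 0  # empty validation set

    start_epoch, num_batches = None, 0
    should_break, pending_work, epoch_tensor = False, None, None

    while True:
        # Collect the result of the all_reduce launched last iteration.
        if pending_work is not None:
            pending_work.wait()
            should_break = epoch_tensor.item() > 0
            pending_work = None
        if should_break or (eval_steps > 0 and num_batches >= eval_steps):
            break

        batch = next_batch
        if start_epoch is None:
            start_epoch = get_epoch(batch)

        # Prefetch the next batch and start a non-blocking MAX reduction on
        # "did any rank roll over to the next epoch?".
        try:
            next_batch = next(batches)
            increment = (get_epoch(next_batch) or 0) - (start_epoch or 0)
            if dist.is_initialized():
                epoch_tensor = torch.tensor(
                    [increment], dtype=torch.long, device=device
                )
                pending_work = dist.all_reduce(
                    epoch_tensor, op=dist.ReduceOp.MAX, async_op=True
                )
            else:
                should_break = increment > 0  # single-rank fallback
        except StopIteration:
            should_break = True  # no more batches: process this one, then stop

        # Compute on the current batch while the reduction runs in the background.
        process_batch(batch)
        num_batches += 1

    return num_batches
```

As in the patched `evaluate()`, the `wait()` at the top of each iteration is expected to return almost immediately because the forward pass of the previous batch overlapped with the communication; `eval_steps` plays the same role as `validation.eval_steps` (0 meaning a full epoch).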