@@ -0,0 +1,64 @@
# Training Configuration for Custom Dataset (Transfer1)
# Dataset: Custom labeled videos with physical plausibility scores
# Model: Cosmos-Reason1-7B
#
# This configuration is optimized for 8 GPUs. Adjust dp_shard_size based on your setup:
# - 2 GPUs: dp_shard_size = 2
# - 4 GPUs: dp_shard_size = 4
# - 8 GPUs: dp_shard_size = 8

[custom.dataset]
# Path to training dataset with conversation format
path = "data/transfer1_split_with_conv/train"

[train]
# Training epochs and output configuration
epoch = 10
output_dir = "outputs/transfer1_sft"
compile = false
train_batch_per_replica = 32

# Evaluation configuration
eval_steps = 50
evaluation_strategy = "steps"
save_strategy = "steps"
load_best_model_at_end = true
metric_for_best_model = "eval_loss"

[policy]
# Model configuration
model_name_or_path = "nvidia/Cosmos-Reason1-7B"
model_max_length = 4096

[logging]
# Logging configuration
logger = ['console', 'tensorboard']
project_name = "cosmos_reason1"
experiment_name = "post_training_hf/transfer1_sft"

[train.train_policy]
# Training policy configuration
type = "sft"
conversation_column_name = "conversations"
mini_batch = 4

[train.eval_policy]
# Evaluation dataset configuration
dataset.name = "data/transfer1_split_with_conv/eval"

[train.ckpt]
# Checkpoint configuration
enable_checkpoint = true
save_freq = 50
max_keep = 5
save_mode = "async"

[policy.parallelism]
# Parallelism configuration
tp_size = 1
cp_size = 1
dp_shard_size = 8
pp_size = 1
dp_replicate_size = 1


Expand Up @@ -372,12 +372,208 @@ Realistically, dough should stretch and fold in certain ways when rolled or shap
- **Model prediction**: 2. (The prediction matches the ground truth.)
- **Summary of the model output**: The analysis has successfully identified the key issues in the video, including unnatural deformation, inconsistent texture, gravity-defying movement, abrupt motion changes, and unrealistic food preparation behavior.

## Fine-Tuning on Custom Datasets

Having demonstrated fine-tuning on the public VideoPhy-2 dataset, we now show how to adapt this methodology to custom datasets. This section uses videos generated by Cosmos Transfer 2.5 with human-labeled physical plausibility scores.

### Dataset Preparation

The custom dataset workflow supports local video files with human-annotated quality scores. The dataset preparation involves:

1. **Data Organization**: Videos and associated metadata (prompts, labels)
2. **Train/Eval Split**: Stratified splitting to maintain label distribution
3. **Conversation Format**: Converting to the format required for SFT training

### Step 1: Create Train/Eval Split

The first step creates a stratified train/eval split from local videos with labels. Copy the script from the cookbook:

```bash
# In cosmos-reason1 root directory
cp /path/to/cosmos-cookbook/scripts/examples/reason1/physical-plausibility-check/create_dataset_with_split.py \
  examples/post_training_hf/scripts/
```

Prepare your data directory structure:

```
data/
├── transfer1_generated_videos/ # Video files (.mp4)
├── prompts/ # Prompt text files (.txt)
└── transfer25_human_labeled.xlsx # Labels spreadsheet
```

**Example prompt file** (`prompts/video_001_prompt.txt`):

```
A person waves hello to another person approaching from the left
```

**Example labels spreadsheet** (`transfer25_human_labeled.xlsx`):

| output_link | Action alignment | Physical common sense | Quality |
|-------------|------------------|----------------------|---------|
| https://example.com/videos/video_001.mp4 | 5 | 5 | 5 |
| https://example.com/videos/video_002.mp4 | 4 | 3 | 4 |
| https://example.com/videos/video_003.mp4 | 2 | 1 | 2 |

The script expects:
- **output_link**: Video URL or path (used to match video files)
- **Physical common sense**: Score on a binary 0/1 or 1-5 scale (use `--scale_labels` to map 0/1 scores to 1-5)

**Note**: If your video URLs don't match the filename pattern, customize the `extract_filename_from_url()` function in `create_dataset_with_split.py`. The script includes examples for simple and complex URL patterns.
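For reference, here is a minimal sketch of such a customization, assuming the URL path simply ends with the video filename; the actual function in the script may look different:

```python
# Hypothetical variant of extract_filename_from_url() in
# create_dataset_with_split.py -- adjust to your own URL scheme.
from pathlib import PurePosixPath
from urllib.parse import urlparse


def extract_filename_from_url(url: str) -> str:
    """Map a spreadsheet URL to a local video filename.

    Example: https://example.com/videos/video_001.mp4 -> video_001.mp4
    """
    return PurePosixPath(urlparse(url).path).name
```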

Run the script to create train/eval split:

```bash
cd examples/post_training_hf/

uv run scripts/create_dataset_with_split.py \
  --output_dir data/transfer1_split \
  --data_dir data \
  --excel_file transfer25_human_labeled.xlsx \
  --eval_size 0.1 \
  --balance_labels \
  --scale_labels
```

**Key Options:**

- `--eval_size 0.1`: 10% of data for evaluation
- `--balance_labels`: Balance the label distribution across classes (see the sketch after this list)
- `--scale_labels`: Map binary labels (0,1) to 1-5 scale
- `--random_seed 42`: Reproducible splitting
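Conceptually, the split is stratified by label so that both splits preserve the score distribution. A minimal sketch of the idea using scikit-learn (illustrative only; the cookbook script's internals may differ):

```python
# Illustrative stratified 90/10 split with scikit-learn; the cookbook
# script may implement this differently.
from sklearn.model_selection import train_test_split

# Toy samples: 50 videos with 1-5 plausibility scores.
samples = [
    {"video": f"video_{i:03d}.mp4", "pc": score}
    for i, score in enumerate([1, 2, 3, 4, 5] * 10)
]
labels = [s["pc"] for s in samples]

train_split, eval_split = train_test_split(
    samples,
    test_size=0.1,    # --eval_size 0.1
    stratify=labels,  # preserve the label distribution in both splits
    random_state=42,  # --random_seed 42
)
print(len(train_split), len(eval_split))  # 45 5
```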

### Step 2: Add Conversation Format

The second step converts the dataset to the conversation format required for training. Copy the script:

```bash
cp /path/to/cosmos-cookbook/scripts/examples/reason1/physical-plausibility-check/add_conversations_to_dataset.py \
  examples/post_training_hf/scripts/
```

Convert both train and eval splits:

```bash
# Process train split
uv run scripts/add_conversations_to_dataset.py \
  --input_dir data/transfer1_split/train \
  --output_dir data/transfer1_split_with_conv/train \
  --prompt_path prompts/video_reward.yaml

# Process eval split
uv run scripts/add_conversations_to_dataset.py \
  --input_dir data/transfer1_split/eval \
  --output_dir data/transfer1_split_with_conv/eval \
  --prompt_path prompts/video_reward.yaml
```
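Each output record gains a `conversations` column holding a JSON-serialized chat that pairs the video with its labeled score. For orientation, a record resembles the sketch below; the exact schema is produced by `create_conversation` from `cosmos_reason1_utils` and may differ:

```python
# Illustrative structure only; the real schema is defined by
# cosmos_reason1_utils.text.create_conversation.
conversation = [
    {"role": "system", "content": "<system prompt from video_reward.yaml>"},
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "data/transfer1_generated_videos/video_001.mp4"},
            {"type": "text", "text": "<user prompt from video_reward.yaml>"},
        ],
    },
    # The target response wraps the human label in <answer> tags,
    # matching add_conversations_to_dataset.py.
    {"role": "assistant", "content": "<answer>\n4\n</answer>"},
]
```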

### Step 3: Configure Training

Copy the training configuration from the cookbook:

```bash
cp /path/to/cosmos-cookbook/docs/recipes/post_training/reason1/physical-plausibility-check/assets/custom_dataset_sft_config.toml \
  examples/post_training_hf/configs/transfer1_sft.toml
```

Training uses the `scripts/custom_sft.py` script already available in the cosmos-reason1 repository.

**Key Configuration Parameters** (from `configs/transfer1_sft.toml`):

- `custom.dataset.path`: Path to training dataset (`"data/transfer1_split_with_conv/train"`)
- `train.epoch`: Number of training epochs (10)
- `train.eval_steps`: Evaluate every 50 steps
- `train.output_dir`: Output directory for checkpoints (`"outputs/transfer1_sft"`)
- `policy.model_name_or_path`: Base model (`"nvidia/Cosmos-Reason1-7B"`)
- `policy.parallelism.dp_shard_size`: Data parallel sharding; adjust based on GPU count (2, 4, or 8; see the example below)
- `train.ckpt.save_freq`: Save checkpoint every 50 steps
- `train.ckpt.max_keep`: Keep at most 5 checkpoints
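For example, to train on a 4-GPU node, edit the parallelism block in `configs/transfer1_sft.toml` accordingly (mirroring the guidance in the config header):

```toml
[policy.parallelism]
dp_shard_size = 4  # one shard per GPU on a 4-GPU node
```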

### Step 4: Run Training

Start the fine-tuning process:

```bash
cd examples/post_training_hf/
cosmos-rl --config configs/transfer1_sft.toml scripts/custom_sft.py
```

Training outputs are saved to `outputs/transfer1_sft/[timestamp]/`:

- `safetensors/step_*/`: Model checkpoints
- `tensorboard/`: Training metrics

Monitor training progress with TensorBoard:

```bash
tensorboard --logdir outputs/transfer1_sft/
```

### Step 5: Evaluate Fine-Tuned Model

After training, evaluate the model on the evaluation dataset. Copy the evaluation script:

```bash
cp /path/to/cosmos-cookbook/scripts/examples/reason1/physical-plausibility-check/evaluate_model.py \
examples/post_training_hf/scripts/
```

Run evaluation:

```bash
uv run scripts/evaluate_model.py \
  --model_path outputs/transfer1_sft/[timestamp]/safetensors/step_80 \
  --eval_dataset data/transfer1_split_with_conv/eval \
  --prompt_path prompts/video_reward.yaml \
  --output_dir eval_results
```

The evaluation generates:

- `evaluation_results.json`: Detailed metrics
- `evaluation_report.html`: Interactive HTML report

**Evaluation Metrics:**

- **Exact Accuracy**: Percentage of exact score matches
- **Within ±1 Accuracy**: Predictions within 1 point of ground truth
- **Mean Absolute Error**: Average prediction error
- **Binary Classification**: Precision, recall, F1 for good vs bad videos
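As a sanity check, the headline metrics above reduce to a few lines of Python, assuming integer 1-5 predictions and ground-truth scores (illustrative; `evaluate_model.py` may compute them differently):

```python
# Illustrative metric computation; evaluate_model.py may differ in details.
def score_metrics(preds: list[int], truths: list[int]) -> dict[str, float]:
    n = len(preds)
    exact = sum(p == t for p, t in zip(preds, truths)) / n
    within_one = sum(abs(p - t) <= 1 for p, t in zip(preds, truths)) / n
    mae = sum(abs(p - t) for p, t in zip(preds, truths)) / n
    return {"exact_accuracy": exact, "within_1_accuracy": within_one, "mae": mae}


print(score_metrics([5, 4, 2, 3], [5, 3, 1, 5]))
# {'exact_accuracy': 0.25, 'within_1_accuracy': 0.75, 'mae': 1.0}
```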

### Results and Analysis

The fine-tuned model shows improved performance on the custom dataset. The evaluation report provides:

- Overall accuracy metrics
- Confusion matrix showing prediction patterns
- Per-sample results with model responses
- Binary classification metrics for quality filtering

This workflow can be adapted to other video quality assessment tasks by:

1. Organizing videos and labels in the specified format
2. Adjusting the prompt template for your specific task
3. Modifying the label scaling if using different score ranges (see the sketch below)
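For the last point, a hypothetical rescaling helper, assuming source scores on a known 0 to `src_max` range (the built-in `--scale_labels` flag covers only the binary 0/1 case):

```python
# Hypothetical helper, not part of the cookbook scripts: map scores from
# an arbitrary 0..src_max range onto the 1-5 scale the prompt expects.
def rescale_to_1_5(score: float, src_max: float) -> int:
    if not 0 <= score <= src_max:
        raise ValueError(f"score {score} outside [0, {src_max}]")
    return round(1 + 4 * score / src_max)


assert rescale_to_1_5(0, 10) == 1
assert rescale_to_1_5(5, 10) == 3
assert rescale_to_1_5(10, 10) == 5
```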

## Conclusion

This case study demonstrates the full spectrum of fine-tuning Cosmos Reason 1 for physical plausibility prediction:

- **Zero-shot Performance**: The base model shows strong understanding of physical laws without fine-tuning
- **Supervised Fine-Tuning**: Training on VideoPhy-2 improves correlation from 0.293 to 0.395
- **Reinforcement Learning**: Further enhancement to 0.425 correlation with better reasoning traces
- **Custom Dataset Adaptation**: Complete workflow for fine-tuning on domain-specific datasets

Key insights:

- **Progressive improvement**: Each training stage (SFT, RL) delivers measurable gains in both accuracy and correlation, with RL achieving the best overall performance.
- **Thinking traces enhance interpretability**: RL training with structured prompts enables the model to generate detailed reasoning traces that explain its predictions.
- **Flexibility**: The methodology can be adapted to custom datasets and other video quality assessment tasks by following the dataset preparation workflow and adjusting prompts and metrics.

The custom dataset workflow enables practitioners to:

1. Leverage videos from Cosmos Transfer or other sources
2. Apply human labeling for domain-specific quality criteria
3. Fine-tune models for specialized use cases in video generation quality control

As a next step, we can investigate reasoning SFT as a warmup step using datasets that contain thinking traces. This can improve the model's reasoning ability before RL training.
@@ -0,0 +1,87 @@
#!/usr/bin/env python3
"""Add conversation format to existing datasets.

This script converts datasets with caption/video_url/pc format
to the conversation format required for training.
"""

import argparse
import json
from pathlib import Path

import datasets
import yaml
from cosmos_reason1_utils.text import PromptConfig, create_conversation
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input_dir", type=str, required=True,
        help="Input dataset directory"
    )
    parser.add_argument(
        "--output_dir", type=str, required=True,
        help="Output dataset directory"
    )
    parser.add_argument(
        "--prompt_path", type=str, required=True,
        help="Path to prompt YAML file"
    )
    args = parser.parse_args()

    # Load prompt template
    print(f"📝 Loading prompt from: {args.prompt_path}")
    with open(args.prompt_path, "r") as f:
        prompt_config = PromptConfig.model_validate(yaml.safe_load(f))

    system_prompt = prompt_config.system_prompt
    user_prompt = prompt_config.user_prompt

    # Load existing dataset
    print(f"📂 Loading dataset from: {args.input_dir}")
    dataset = datasets.load_from_disk(args.input_dir)
    print(f"✅ Loaded {len(dataset)} samples")
    print(f"Current features: {list(dataset.features.keys())}")

    # Convert each sample to conversation format
    print("\n🔄 Converting to conversation format...")
    conversations = []

    for sample in tqdm(dataset, desc="Processing samples"):
        video_path = sample["video_url"]
        pc_score = sample["pc"]

        # Build a conversation pairing the video with its labeled score
        conversation = create_conversation(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            videos=[video_path],
            response=f"<answer>\n{pc_score}\n</answer>",
        )

        conversations.append(json.dumps(conversation))

    # Add the serialized conversations as a new column
    dataset = dataset.add_column("conversations", conversations)

    print("\n✅ Added 'conversations' column")
    print(f"New features: {list(dataset.features.keys())}")

    # Save updated dataset
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n💾 Saving to: {output_dir}")
    dataset.save_to_disk(str(output_dir))

    print("\n✅ Dataset saved successfully!")
    print("\nSample conversation:")
    print(json.loads(dataset[0]["conversations"]))


if __name__ == "__main__":
    main()

