diff --git a/docs/recipes/post_training/reason1/physical-plausibility-check/assets/custom_dataset_sft_config.toml b/docs/recipes/post_training/reason1/physical-plausibility-check/assets/custom_dataset_sft_config.toml
new file mode 100644
index 0000000..63f2c9d
--- /dev/null
+++ b/docs/recipes/post_training/reason1/physical-plausibility-check/assets/custom_dataset_sft_config.toml
@@ -0,0 +1,64 @@
+# Training Configuration for Custom Dataset (Transfer1)
+# Dataset: Custom labeled videos with physical plausibility scores
+# Model: Cosmos-Reason1-7B
+#
+# This configuration is optimized for 8 GPUs. Adjust dp_shard_size based on your setup:
+# - 2 GPUs: dp_shard_size = 2
+# - 4 GPUs: dp_shard_size = 4
+# - 8 GPUs: dp_shard_size = 8
+
+[custom.dataset]
+# Path to training dataset with conversation format
+path = "data/transfer1_split_with_conv/train"
+
+[train]
+# Training epochs and output configuration
+epoch = 10
+output_dir = "outputs/transfer1_sft"
+compile = false
+train_batch_per_replica = 32
+
+# Evaluation configuration
+eval_steps = 50
+evaluation_strategy = "steps"
+save_strategy = "steps"
+load_best_model_at_end = true
+metric_for_best_model = "eval_loss"
+
+[policy]
+# Model configuration
+model_name_or_path = "nvidia/Cosmos-Reason1-7B"
+model_max_length = 4096
+
+[logging]
+# Logging configuration
+logger = ['console', 'tensorboard']
+project_name = "cosmos_reason1"
+experiment_name = "post_training_hf/transfer1_sft"
+
+[train.train_policy]
+# Training policy configuration
+type = "sft"
+conversation_column_name = "conversations"
+mini_batch = 4
+
+[train.eval_policy]
+# Evaluation dataset configuration
+dataset.name = "data/transfer1_split_with_conv/eval"
+
+[train.ckpt]
+# Checkpoint configuration
+enable_checkpoint = true
+save_freq = 50
+max_keep = 5
+save_mode = "async"
+
+[policy.parallelism]
+# Parallelism configuration
+tp_size = 1
+cp_size = 1
+dp_shard_size = 8
+pp_size = 1
+dp_replicate_size = 1
+
+
diff --git a/docs/recipes/post_training/reason1/physical-plausibility-check/post_training.md b/docs/recipes/post_training/reason1/physical-plausibility-check/post_training.md
index c694ba3..21cb1a2 100644
--- a/docs/recipes/post_training/reason1/physical-plausibility-check/post_training.md
+++ b/docs/recipes/post_training/reason1/physical-plausibility-check/post_training.md
@@ -372,12 +372,208 @@ Realistically, dough should stretch and fold in certain ways when rolled or shap
- **Model prediction**: 2. (The prediction matches the ground truth.)
- **Summary of the model output**: The analysis has successfully identified the key issues in the video, including unnatural deformation, inconsistent texture, gravity-defying movement, abrupt motion changes, and unrealistic food preparation behavior.
+## Fine-Tuning on Custom Datasets
+
+Having demonstrated fine-tuning on the public VideoPhy-2 dataset, we now show how to adapt this methodology to custom datasets. This section uses videos generated by Cosmos Transfer 2.5 with human-labeled physical plausibility scores.
+
+### Dataset Preparation
+
+The custom dataset workflow supports local video files with human-annotated quality scores. The dataset preparation involves:
+
+1. **Data Organization**: Videos and associated metadata (prompts, labels)
+2. **Train/Eval Split**: Stratified splitting to maintain label distribution
+3. **Conversation Format**: Converting to the format required for SFT training
+
+### Step 1: Create Train/Eval Split
+
+The first step creates a stratified train/eval split from local videos with labels. Copy the script from the cookbook:
+
+```bash
+# In cosmos-reason1 root directory
+cp /path/to/cosmos-cookbook/scripts/examples/reason1/physical-plausibility-check/create_dataset_with_split.py \
+ examples/post_training_hf/scripts/
+```
+
+Prepare your data directory structure:
+
+```
+data/
+├── transfer1_generated_videos/ # Video files (.mp4)
+├── prompts/ # Prompt text files (.txt)
+└── transfer25_human_labeled.xlsx # Labels spreadsheet
+```
+
+**Example prompt file** (`prompts/video_001_prompt.txt`):
+
+```
+A person waves hello to another person approaching from the left
+```
+
+**Example labels spreadsheet** (`transfer25_human_labeled.xlsx`):
+
+| output_link | Action alignment | Physical common sense | Quality |
+|-------------|------------------|----------------------|---------|
+| https://example.com/videos/video_001.mp4 | 5 | 5 | 5 |
+| https://example.com/videos/video_002.mp4 | 4 | 3 | 4 |
+| https://example.com/videos/video_003.mp4 | 2 | 1 | 2 |
+
+The script expects:
+- **output_link**: Video URL or path (used to match video files)
+- **Physical common sense**: Either a binary score (0 = implausible, 1 = plausible) or a 1-5 score (use `--scale_labels` to map binary labels to the 1-5 scale: 0→1, 1→5)
+
+**Note**: If your video URLs don't match the filename pattern, customize the `extract_filename_from_url()` function in `create_dataset_with_split.py`. The script includes examples for simple and complex URL patterns.
+
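+For instance, if your URLs simply end with the video filename, a minimal replacement might look like the sketch below (the URL layout shown is illustrative, not part of the recipe):
+
+```python
+# Hypothetical simplified extract_filename_from_url() for URLs that end with
+# the video filename, e.g. "https://example.com/videos/video_001_output.mp4".
+from typing import Optional
+from urllib.parse import urlparse
+
+
+def extract_filename_from_url(url: str) -> Optional[str]:
+    """Return the basename of the URL path, or None for non-.mp4 links."""
+    name = urlparse(url).path.split("/")[-1]
+    return name if name.endswith(".mp4") else None
+```
+
+Keep in mind that `create_dataset_with_split.py` builds the prompt path by replacing `_output.mp4` with `_prompt.txt` in the returned filename, so either keep that suffix convention or adjust the prompt-path mapping as well.
+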
+Run the script to create train/eval split:
+
+```bash
+cd examples/post_training_hf/
+
+uv run scripts/create_dataset_with_split.py \
+ --output_dir data/transfer1_split \
+ --data_dir data \
+ --excel_file transfer25_human_labeled.xlsx \
+ --eval_size 0.1 \
+ --balance_labels \
+ --scale_labels
+```
+
+**Key Options:**
+
+- `--eval_size 0.1`: 10% of data for evaluation
+- `--balance_labels`: Balance label distribution across classes
+- `--scale_labels`: Map binary labels (0,1) to 1-5 scale
+- `--random_seed 42`: Reproducible splitting
+
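+After the split completes, it can be worth sanity-checking the saved datasets before converting them. A minimal sketch, assuming the `--output_dir` used above (`data/transfer1_split`):
+
+```python
+# Sanity-check the train/eval split produced by create_dataset_with_split.py.
+# Paths assume the --output_dir used in the command above.
+from collections import Counter
+
+import datasets
+
+train_ds = datasets.load_from_disk("data/transfer1_split/train")
+eval_ds = datasets.load_from_disk("data/transfer1_split/eval")
+
+print(f"Train: {len(train_ds)} samples, Eval: {len(eval_ds)} samples")
+print("Train label distribution:", Counter(train_ds["pc"]))
+print("Eval label distribution:", Counter(eval_ds["pc"]))
+```
+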
+### Step 2: Add Conversation Format
+
+The second step converts the dataset to the conversation format required for SFT training. Copy the script:
+
+```bash
+cp /path/to/cosmos-cookbook/scripts/examples/reason1/physical-plausibility-check/add_conversations_to_dataset.py \
+ examples/post_training_hf/scripts/
+```
+
+Convert both train and eval splits:
+
+```bash
+# Process train split
+uv run scripts/add_conversations_to_dataset.py \
+ --input_dir data/transfer1_split/train \
+ --output_dir data/transfer1_split_with_conv/train \
+ --prompt_path prompts/video_reward.yaml
+
+# Process eval split
+uv run scripts/add_conversations_to_dataset.py \
+ --input_dir data/transfer1_split/eval \
+ --output_dir data/transfer1_split_with_conv/eval \
+ --prompt_path prompts/video_reward.yaml
+```
+
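+Each converted record keeps the original `caption`, `video_url`, and `pc` columns and gains a JSON-encoded `conversations` column built by `create_conversation` from `cosmos_reason1_utils`. A quick way to inspect one converted sample, assuming the output paths above:
+
+```python
+# Preview one converted sample from the train split (paths from the commands above).
+import json
+
+import datasets
+
+ds = datasets.load_from_disk("data/transfer1_split_with_conv/train")
+print(list(ds.features.keys()))  # caption, video_url, pc, conversations
+
+conversation = json.loads(ds[0]["conversations"])
+print(json.dumps(conversation, indent=2)[:500])  # truncated preview
+```
+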
+### Step 3: Configure Training
+
+Copy the training configuration from the cookbook:
+
+```bash
+cp /path/to/cosmos-cookbook/docs/recipes/post_training/reason1/physical-plausibility-check/assets/custom_dataset_sft_config.toml \
+ examples/post_training_hf/configs/transfer1_sft.toml
+```
+
+Training uses the `scripts/custom_sft.py` script that is already available in the cosmos-reason1 repository.
+
+**Key Configuration Parameters** (from `configs/transfer1_sft.toml`):
+
+- `custom.dataset.path`: Path to training dataset (`"data/transfer1_split_with_conv/train"`)
+- `train.epoch`: Number of training epochs (10)
+- `train.eval_steps`: Evaluate every 50 steps
+- `train.output_dir`: Output directory for checkpoints (`"outputs/transfer1_sft"`)
+- `policy.model_name_or_path`: Base model (`"nvidia/Cosmos-Reason1-7B"`)
+- `policy.parallelism.dp_shard_size`: Data-parallel shard size; set it to match your GPU count (2, 4, or 8)
+- `train.ckpt.save_freq`: Save checkpoint every 50 steps
+- `train.ckpt.max_keep`: Keep at most 5 checkpoints
+
+### Step 4: Run Training
+
+Start the fine-tuning process:
+
+```bash
+cd examples/post_training_hf/
+cosmos-rl --config configs/transfer1_sft.toml scripts/custom_sft.py
+```
+
+Training outputs are saved to `outputs/transfer1_sft/[timestamp]/`:
+
+- `safetensors/step_*/`: Model checkpoints
+- `tensorboard/`: Training metrics
+
+Monitor training progress with TensorBoard:
+
+```bash
+tensorboard --logdir outputs/transfer1_sft/
+```
+
+### Step 5: Evaluate Fine-Tuned Model
+
+After training, evaluate the model on the evaluation dataset. Copy the evaluation script:
+
+```bash
+cp /path/to/cosmos-cookbook/scripts/examples/reason1/physical-plausibility-check/evaluate_model.py \
+ examples/post_training_hf/scripts/
+```
+
+Run evaluation:
+
+```bash
+uv run scripts/evaluate_model.py \
+ --model_path outputs/transfer1_sft/[timestamp]/safetensors/step_80 \
+ --eval_dataset data/transfer1_split_with_conv/eval \
+ --prompt_path prompts/video_reward.yaml \
+ --output_dir eval_results
+```
+
+The evaluation generates:
+
+- `evaluation_results.json`: Detailed metrics
+- `evaluation_report.html`: Interactive HTML report
+
+**Evaluation Metrics:**
+
+- **Exact Accuracy**: Percentage of exact score matches
+- **Within ±1 Accuracy**: Predictions within 1 point of ground truth
+- **Mean Absolute Error**: Average prediction error
+- **Binary Classification**: Precision, recall, F1 for good vs bad videos
+
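+Because the metrics are also written as JSON, they are easy to consume programmatically, for example to gate generated videos on the binary quality metrics. A minimal sketch, assuming the `--output_dir` used above (`eval_results`):
+
+```python
+# Load the metrics written by evaluate_model.py and print a short summary.
+import json
+
+with open("eval_results/evaluation_results.json") as f:
+    report = json.load(f)
+
+metrics = report["metrics"]
+print(f"Exact accuracy:      {metrics['exact_accuracy']:.1%}")
+print(f"Within ±1 accuracy:  {metrics['within_1_accuracy']:.1%}")
+print(f"Mean absolute error: {metrics['mean_absolute_error']:.2f}")
+print(f"Bad-video F1:        {metrics['binary_metrics']['bad_videos']['f1_score']:.1%}")
+print(f"Good-video F1:       {metrics['binary_metrics']['good_videos']['f1_score']:.1%}")
+```
+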
+### Results and Analysis
+
+The fine-tuned model shows improved performance on the custom dataset. The evaluation report provides:
+
+- Overall accuracy metrics
+- Confusion matrix showing prediction patterns
+- Per-sample results with model responses
+- Binary classification metrics for quality filtering
+
+This workflow can be adapted to other video quality assessment tasks by:
+
+1. Organizing videos and labels in the specified format
+2. Adjusting the prompt template for your specific task
+3. Modifying the label scaling if using different score ranges
+
## Conclusion
-Fine-tuning Cosmos Reason 1 on VideoPhy-2 data significantly improves physical plausibility prediction, progressing from zero-shot (0.293 correlation) to SFT (0.395) and RL (0.425). Key insights:
+This case study demonstrates the full spectrum of fine-tuning Cosmos Reason 1 for physical plausibility prediction:
+
+- **Zero-shot Performance**: The base model shows strong understanding of physical laws without fine-tuning
+- **Supervised Fine-Tuning**: Training on VideoPhy-2 improves correlation from 0.293 to 0.395
+- **Reinforcement Learning**: Further enhancement to 0.425 correlation with better reasoning traces
+- **Custom Dataset Adaptation**: Complete workflow for fine-tuning on domain-specific datasets
+
+Key insights:
- **Progressive improvement**: Each training stage (SFT, RL) delivers measurable gains in both accuracy and correlation, with RL achieving the best overall performance.
 - **Thinking traces enhance interpretability**: RL training with structured prompts enables the model to generate detailed reasoning traces that explain its predictions.
-- **Flexibility**: This methodology can be adapted to other video quality assessment tasks by substituting the datasets and defining appropriate metrics.
+- **Flexibility**: The methodology can be adapted to custom datasets and other video quality assessment tasks by following the dataset preparation workflow and adjusting prompts and metrics.
+
+The custom dataset workflow enables practitioners to:
-As a next step, we can investigate reasoning SFT as a warmup step using datasets that contain thinking traces. This can improve the model's reasoning ability before RL training.
+1. Leverage videos from Cosmos Transfer or other sources
+2. Apply human labeling for domain-specific quality criteria
+3. Fine-tune models for specialized use cases in video generation quality control
diff --git a/scripts/examples/reason1/physical-plausibility-check/add_conversations_to_dataset.py b/scripts/examples/reason1/physical-plausibility-check/add_conversations_to_dataset.py
new file mode 100644
index 0000000..3ba1e74
--- /dev/null
+++ b/scripts/examples/reason1/physical-plausibility-check/add_conversations_to_dataset.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Add conversation format to existing datasets.
+
+This script converts datasets with caption/video_url/pc format
+to the conversation format required for training.
+"""
+
+import argparse
+import json
+from pathlib import Path
+
+import datasets
+import yaml
+from cosmos_reason1_utils.text import PromptConfig, create_conversation
+from tqdm import tqdm
+
+
+def main():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--input_dir", type=str, required=True,
+ help="Input dataset directory"
+ )
+ parser.add_argument(
+ "--output_dir", type=str, required=True,
+ help="Output dataset directory"
+ )
+ parser.add_argument(
+ "--prompt_path", type=str, required=True,
+ help="Path to prompt YAML file"
+ )
+ args = parser.parse_args()
+
+ # Load prompt template
+ print(f"📝 Loading prompt from: {args.prompt_path}")
+ with open(args.prompt_path, 'r') as f:
+ prompt_config = PromptConfig.model_validate(yaml.safe_load(f))
+
+ system_prompt = prompt_config.system_prompt
+ user_prompt = prompt_config.user_prompt
+
+ # Load existing dataset
+ print(f"📂 Loading dataset from: {args.input_dir}")
+ dataset = datasets.load_from_disk(args.input_dir)
+ print(f"✅ Loaded {len(dataset)} samples")
+ print(f"Current features: {list(dataset.features.keys())}")
+
+ # Convert to conversation format
+ print("\n🔄 Converting to conversation format...")
+ conversations = []
+
+ for sample in tqdm(dataset, desc="Processing samples"):
+ video_path = sample['video_url']
+ pc_score = sample['pc']
+
+ # Create conversation
+ conversation = create_conversation(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ videos=[video_path],
+        response=f"<answer>\n{pc_score}\n</answer>",
+ )
+
+ conversations.append(json.dumps(conversation))
+
+ # Add conversations column to dataset
+ dataset = dataset.add_column("conversations", conversations)
+
+ print(f"\n✅ Added 'conversations' column")
+ print(f"New features: {list(dataset.features.keys())}")
+
+ # Save updated dataset
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ print(f"\n💾 Saving to: {output_dir}")
+ dataset.save_to_disk(str(output_dir))
+
+ print(f"\n✅ Dataset saved successfully!")
+ print(f"\nSample conversation:")
+ print(json.loads(dataset[0]['conversations']))
+
+
+if __name__ == "__main__":
+ main()
+
+
diff --git a/scripts/examples/reason1/physical-plausibility-check/create_dataset_with_split.py b/scripts/examples/reason1/physical-plausibility-check/create_dataset_with_split.py
new file mode 100644
index 0000000..f9fdb79
--- /dev/null
+++ b/scripts/examples/reason1/physical-plausibility-check/create_dataset_with_split.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python3
+#
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Create training and evaluation datasets from local video files.
+
+This script creates train/eval splits from local video files with human-labeled quality scores.
+Supports stratified splitting to maintain label distribution in both sets.
+"""
+
+import argparse
+import json
+import os
+import random
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Optional
+
+try:
+ import datasets
+ import pandas as pd
+ import yaml
+ from rich import print
+ from tqdm import tqdm
+except ImportError as e:
+ print(f"Error: Missing required package: {e}")
+ print("Please install dependencies:")
+ print(" pip install datasets pandas openpyxl pyyaml rich tqdm")
+ sys.exit(1)
+
+
+def extract_filename_from_url(url: str) -> Optional[str]:
+ """Extract local filename from URL.
+
+ This function extracts the filename from video URLs. Customize this
+ function based on your URL structure. Examples:
+
+ - Simple: Extract just the filename
+ url = "https://example.com/videos/video_001.mp4" -> "video_001.mp4"
+
+ - Complex: Parse structured paths and reconstruct filenames
+ url = "https://example.com/action/wave/segment_01/video_001.mp4"
+ -> "wave_segment_01_video_001.mp4"
+ """
+ # Example 1: Simple filename extraction
+ # Uncomment and modify for your use case:
+ # return url.split('/')[-1] # Returns last part of URL
+
+ # Example 2: Custom pattern matching
+ # Modify this pattern to match your URL structure
+ pattern = r'([\w]+)/(com_\d+_\d+_[a-f0-9]+)_segment_(\d+)_left/gpu_(\d+)/video_(\d+)/output\.mp4'
+ match = re.search(pattern, url)
+ if match:
+ action = match.group(1)
+ timestamp_id = match.group(2)
+ segment = match.group(3)
+ gpu = match.group(4)
+ video = match.group(5)
+ filename = f'{action}_{timestamp_id}_segment_{segment}_left_gpu_{gpu}_video_{video}_output.mp4'
+ return filename
+
+ # Fallback: Try extracting just the filename
+ if '/' in url:
+ return url.split('/')[-1]
+
+ return None
+
+
+def balance_dataset_labels(dataset: datasets.Dataset, verbose: bool = True) -> datasets.Dataset:
+ """Balance dataset by resampling so each label appears the same number of times."""
+ random.seed(42)
+
+ # Extract PC labels and group samples by label
+ label_to_indices = {}
+ for i, sample in enumerate(dataset):
+ pc_score = sample.get("pc")
+ if pc_score is not None:
+ if pc_score not in label_to_indices:
+ label_to_indices[pc_score] = []
+ label_to_indices[pc_score].append(i)
+
+ if verbose:
+ print("\n📊 Original label distribution:")
+ for label in sorted(label_to_indices.keys()):
+ count = len(label_to_indices[label])
+ print(f" Label {label}: {count} samples")
+
+ # target samples per label is the average number of samples per label
+ target_samples_per_label = len(dataset) // len(label_to_indices)
+
+ if verbose:
+ print(f"\n🎯 Target samples per label: {target_samples_per_label}")
+
+ # Resample each label to target count
+ balanced_indices = []
+ for label, indices in label_to_indices.items():
+ if len(indices) >= target_samples_per_label:
+ if verbose:
+ print(f"Downsampling label {label} from {len(indices)} to {target_samples_per_label}")
+ selected_indices = random.sample(indices, target_samples_per_label)
+ else:
+ if verbose:
+ print(f"Upsampling label {label} from {len(indices)} to {target_samples_per_label}")
+ selected_indices = random.choices(indices, k=target_samples_per_label)
+
+ balanced_indices.extend(selected_indices)
+
+ # Shuffle the balanced indices
+ random.shuffle(balanced_indices)
+
+ # Create new balanced dataset
+ balanced_data = [dataset[i] for i in balanced_indices]
+ balanced_dataset = datasets.Dataset.from_list(balanced_data)
+
+ if verbose:
+ print("\n📊 Final balanced label distribution:")
+ final_label_counts = Counter(sample["pc"] for sample in balanced_dataset)
+ for label in sorted(final_label_counts.keys()):
+ print(f" Label {label}: {final_label_counts[label]} samples")
+ print(f"\n✅ Dataset balanced: {len(dataset)} → {len(balanced_dataset)} samples")
+
+ return balanced_dataset
+
+
+def stratified_split(dataset: datasets.Dataset, eval_size: float = 0.1, random_seed: int = 42):
+ """Split dataset into train/eval while maintaining label distribution."""
+ random.seed(random_seed)
+
+ # Group indices by label
+ label_to_indices = {}
+ for i, sample in enumerate(dataset):
+ pc_score = sample.get("pc")
+ if pc_score is not None:
+ if pc_score not in label_to_indices:
+ label_to_indices[pc_score] = []
+ label_to_indices[pc_score].append(i)
+
+ train_indices = []
+ eval_indices = []
+
+ # Split each label proportionally
+ for label, indices in label_to_indices.items():
+ random.shuffle(indices)
+ split_point = int(len(indices) * (1 - eval_size))
+ train_indices.extend(indices[:split_point])
+ eval_indices.extend(indices[split_point:])
+
+ # Shuffle to mix labels
+ random.shuffle(train_indices)
+ random.shuffle(eval_indices)
+
+ # Create datasets
+ train_data = [dataset[i] for i in train_indices]
+ eval_data = [dataset[i] for i in eval_indices]
+
+ train_dataset = datasets.Dataset.from_list(train_data)
+ eval_dataset = datasets.Dataset.from_list(eval_data)
+
+ return train_dataset, eval_dataset
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description=__doc__,
+ formatter_class=argparse.RawDescriptionHelpFormatter
+ )
+ parser.add_argument(
+ "--output_dir", type=str, help="Output directory for train/eval datasets.", required=True
+ )
+ parser.add_argument(
+ "--data_dir",
+ type=str,
+ default="data",
+ help="Path to data directory containing videos, prompts, and Excel file.",
+ )
+ parser.add_argument(
+ "--excel_file",
+ type=str,
+ default="transfer25_human_labeled.xlsx",
+ help="Excel file with video URLs and labels.",
+ )
+ parser.add_argument(
+ "--eval_size",
+ type=float,
+ default=0.1,
+ help="Fraction of data to use for evaluation (default: 0.1 = 10%%).",
+ )
+ parser.add_argument(
+ "--balance_labels",
+ action="store_true",
+ help="Balance dataset labels before splitting.",
+ )
+ parser.add_argument(
+ "--scale_labels",
+ action="store_true",
+ help="Map binary labels (0,1) to 1-5 scale: 0→1 (bad), 1→5 (good).",
+ )
+ parser.add_argument(
+ "--random_seed",
+ type=int,
+ default=42,
+ help="Random seed for reproducibility.",
+ )
+ args = parser.parse_args()
+
+ # Set random seed
+ random.seed(args.random_seed)
+
+ output_dir = Path(args.output_dir).resolve()
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Resolve paths
+ data_dir = Path(args.data_dir).resolve()
+ video_dir = data_dir / "transfer1_generated_videos"
+ prompt_dir = data_dir / "prompts"
+ excel_path = data_dir / args.excel_file
+
+ print(f"📂 Data directory: {data_dir}")
+ print(f"📹 Video directory: {video_dir}")
+ print(f"📝 Prompt directory: {prompt_dir}")
+ print(f"📊 Excel file: {excel_path}")
+ print(f"📁 Output directory: {output_dir}")
+ print(f"🎲 Random seed: {args.random_seed}")
+ print(f"📊 Eval size: {args.eval_size * 100:.0f}%")
+
+ # Read Excel file
+ print("\n📖 Reading Excel file...")
+ df = pd.read_excel(excel_path, skiprows=1, names=["video_url", "label"])
+ print(f"Found {len(df)} labeled videos")
+ print(f"Label distribution: {df['label'].value_counts().to_dict()}")
+
+ # Extract filenames from URLs
+ df["filename"] = df["video_url"].apply(extract_filename_from_url)
+ missing_filenames = df["filename"].isna().sum()
+ if missing_filenames > 0:
+ print(f"⚠️ Warning: {missing_filenames} URLs couldn't be parsed")
+ df = df[df["filename"].notna()].reset_index(drop=True)
+
+ # Verify files exist
+ df["video_path"] = df["filename"].apply(lambda x: str(video_dir / x))
+ df["prompt_path"] = df["filename"].apply(
+ lambda x: str(prompt_dir / x.replace("_output.mp4", "_prompt.txt"))
+ )
+
+ df["video_exists"] = df["video_path"].apply(os.path.exists)
+ df["prompt_exists"] = df["prompt_path"].apply(os.path.exists)
+
+ missing_videos = (~df["video_exists"]).sum()
+ missing_prompts = (~df["prompt_exists"]).sum()
+
+ if missing_videos > 0:
+ print(f"⚠️ Warning: {missing_videos} videos not found locally")
+ if missing_prompts > 0:
+ print(f"⚠️ Warning: {missing_prompts} prompts not found locally")
+
+ df = df[df["video_exists"] & df["prompt_exists"]].reset_index(drop=True)
+ print(f"✅ {len(df)} samples have both video and prompt files")
+
+ # Scale labels if requested
+ if args.scale_labels:
+ print("\n🔄 Scaling labels: 0→1 (bad physics), 1→5 (good physics)")
+ df["pc"] = df["label"].apply(lambda x: 1 if x == 0 else 5)
+ else:
+ df["pc"] = df["label"]
+
+ # Read prompts
+ print("\n📝 Reading prompt files...")
+ prompts = []
+ for prompt_path in tqdm(df["prompt_path"], desc="Loading prompts"):
+ try:
+ with open(prompt_path, "r") as f:
+ prompt_text = f.read().strip()
+ prompts.append(prompt_text)
+ except Exception as e:
+ print(f"⚠️ Error reading {prompt_path}: {e}")
+ prompts.append("")
+
+ df["caption"] = prompts
+
+ # Create dataset
+ dataset_dict = {
+ "caption": df["caption"].tolist(),
+ "video_url": df["video_path"].tolist(),
+ "pc": df["pc"].tolist(),
+ }
+
+ full_dataset = datasets.Dataset.from_dict(dataset_dict)
+ print(f"\n📦 Created full dataset with {len(full_dataset)} samples")
+
+ # Balance if requested (before splitting)
+ if args.balance_labels:
+ print("\n⚖️ Balancing dataset labels...")
+ full_dataset = balance_dataset_labels(full_dataset)
+
+ # Perform stratified split
+ print(f"\n✂️ Splitting dataset: {(1-args.eval_size)*100:.0f}% train, {args.eval_size*100:.0f}% eval")
+ train_dataset, eval_dataset = stratified_split(full_dataset, eval_size=args.eval_size, random_seed=args.random_seed)
+
+ # Print split statistics
+ print(f"\n📊 Split Statistics:")
+ print(f" Train: {len(train_dataset)} samples")
+ train_label_counts = Counter(train_dataset["pc"])
+ for label in sorted(train_label_counts.keys()):
+ print(f" Label {label}: {train_label_counts[label]} samples ({train_label_counts[label]/len(train_dataset)*100:.1f}%)")
+
+ print(f"\n Eval: {len(eval_dataset)} samples")
+ eval_label_counts = Counter(eval_dataset["pc"])
+ for label in sorted(eval_label_counts.keys()):
+ print(f" Label {label}: {eval_label_counts[label]} samples ({eval_label_counts[label]/len(eval_dataset)*100:.1f}%)")
+
+ # Save datasets
+ train_path = output_dir / "train"
+ eval_path = output_dir / "eval"
+
+ print(f"\n💾 Saving datasets...")
+ train_dataset.save_to_disk(str(train_path))
+ eval_dataset.save_to_disk(str(eval_path))
+
+ print(f"✅ Train dataset saved to: {train_path}")
+ print(f"✅ Eval dataset saved to: {eval_path}")
+
+ # Save split info
+ split_info = {
+ "total_samples": len(full_dataset),
+ "train_samples": len(train_dataset),
+ "eval_samples": len(eval_dataset),
+ "eval_fraction": args.eval_size,
+ "random_seed": args.random_seed,
+ "balanced": args.balance_labels,
+ "scaled_labels": args.scale_labels,
+ "train_label_distribution": dict(train_label_counts),
+ "eval_label_distribution": dict(eval_label_counts),
+ }
+
+ split_info_path = output_dir / "split_info.json"
+ with open(split_info_path, "w") as f:
+ json.dump(split_info, f, indent=2)
+
+ print(f"📄 Split info saved to: {split_info_path}")
+
+ print("\n" + "="*80)
+ print("✅ DATASET CREATION COMPLETE!")
+ print("="*80)
+ print(f"\nDataset locations:")
+ print(f" Train: {train_path}")
+ print(f" Eval: {eval_path}")
+ print(f"\nTo load in Python:")
+ print(f" import datasets")
+ print(f" train_ds = datasets.load_from_disk('{train_path}')")
+ print(f" eval_ds = datasets.load_from_disk('{eval_path}')")
+ print("="*80)
+
+
+if __name__ == "__main__":
+ main()
+
+
diff --git a/scripts/examples/reason1/physical-plausibility-check/evaluate_model.py b/scripts/examples/reason1/physical-plausibility-check/evaluate_model.py
new file mode 100644
index 0000000..06562a6
--- /dev/null
+++ b/scripts/examples/reason1/physical-plausibility-check/evaluate_model.py
@@ -0,0 +1,514 @@
+#!/usr/bin/env python3
+"""
+Evaluate Fine-tuned Cosmos Reason 1 Model on Transfer1 Evaluation Dataset
+
+Usage:
+ python3 scripts/evaluate_model.py \
+ --model_path outputs/transfer1_sft/20251023145904/checkpoints/step_80/policy \
+ --eval_dataset data/transfer1_split_with_conv/eval \
+ --prompt_path prompts/video_reward.yaml \
+ --output_dir eval_results
+"""
+
+import argparse
+import json
+import re
+import time
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from collections import Counter, defaultdict
+
+import yaml
+from datasets import load_from_disk
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor
+from vllm import LLM, SamplingParams
+
+
+def parse_response(response):
+ """Parse response to extract integer score from tags."""
+ try:
+        # Try XML parsing first: wrap the response in a root element so the
+        # <answer> tag can be located even when other text surrounds it
+        wrapped = f"<root>{response.strip()}</root>"
+ root = ET.fromstring(wrapped)
+ answer_element = root.find("answer")
+
+ if answer_element is not None and answer_element.text:
+ answer_text = answer_element.text.strip()
+ try:
+ answer_int = int(answer_text)
+ # Ensure score is in valid range
+ if 1 <= answer_int <= 5:
+ return answer_int
+ except ValueError:
+ pass
+
+        # Fall back to a regex over the <answer> tag if XML parsing fails
+        match = re.search(r"<answer>\s*(\d+)\s*</answer>", response)
+ if match:
+ try:
+ answer_int = int(match.group(1))
+ if 1 <= answer_int <= 5:
+ return answer_int
+ except ValueError:
+ pass
+
+ except Exception:
+ pass
+
+ return None
+
+
+def load_prompt_config(prompt_path):
+ """Load prompt configuration from YAML file."""
+ with open(prompt_path, 'r') as f:
+ config = yaml.safe_load(f)
+ return config.get('system_prompt', ''), config.get('user_prompt', '')
+
+
+def run_inference_batch(llm, processor, video_paths, system_prompt, user_prompt, batch_size=4):
+ """Run inference on a batch of videos."""
+ sampling_params = SamplingParams(
+ temperature=0.1, # Low temperature for more deterministic outputs
+ top_k=10,
+ top_p=0.9,
+ repetition_penalty=1.05,
+ max_tokens=512, # Shorter for evaluation
+ )
+
+ results = []
+
+ for i in range(0, len(video_paths), batch_size):
+ batch_videos = video_paths[i:i+batch_size]
+ batch_inputs = []
+
+ for video_path in batch_videos:
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "video",
+ "video": video_path,
+ "fps": 16,
+ "total_pixels": 8192 * 28 * 28,
+ },
+ {"type": "text", "text": user_prompt},
+ ],
+ },
+ ]
+
+ prompt = processor.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True,
+ )
+
+ image_inputs, video_inputs, video_kwargs = process_vision_info(
+ messages, return_video_kwargs=True
+ )
+
+ mm_data = {}
+ if image_inputs is not None:
+ mm_data["image"] = image_inputs
+ if video_inputs is not None:
+ mm_data["video"] = video_inputs
+
+ batch_inputs.append({
+ "prompt": prompt,
+ "multi_modal_data": mm_data,
+ "mm_processor_kwargs": video_kwargs,
+ })
+
+ # Generate responses for batch
+ outputs = llm.generate(batch_inputs, sampling_params=sampling_params)
+
+ for output in outputs:
+ response_text = output.outputs[0].text
+ predicted_score = parse_response(response_text)
+ results.append({
+ 'response': response_text,
+ 'predicted_score': predicted_score
+ })
+
+ return results
+
+
+def calculate_metrics(predictions, ground_truths):
+ """Calculate evaluation metrics."""
+ # Filter out failed predictions
+ valid_pairs = [(pred, gt) for pred, gt in zip(predictions, ground_truths) if pred is not None]
+
+ if not valid_pairs:
+ return None
+
+ predictions_valid = [p for p, _ in valid_pairs]
+ ground_truths_valid = [g for _, g in valid_pairs]
+
+ # Exact accuracy
+ exact_matches = sum(1 for pred, gt in valid_pairs if pred == gt)
+ exact_accuracy = exact_matches / len(valid_pairs)
+
+ # Accuracy within 1 point
+ within_1 = sum(1 for pred, gt in valid_pairs if abs(pred - gt) <= 1)
+ within_1_accuracy = within_1 / len(valid_pairs)
+
+ # Mean Absolute Error
+ mae = sum(abs(pred - gt) for pred, gt in valid_pairs) / len(valid_pairs)
+
+ # Confusion matrix
+ confusion_matrix = defaultdict(lambda: defaultdict(int))
+ for pred, gt in valid_pairs:
+ confusion_matrix[gt][pred] += 1
+
+ # Binary classification metrics (1 vs 5)
+ # Ground truth: 1 = bad, 5 = good
+ binary_predictions = [1 if p <= 2 else (5 if p >= 4 else 3) for p in predictions_valid]
+ binary_ground_truth = [g for g in ground_truths_valid]
+
+ # True positives, false positives, etc. for score 1 (bad videos)
+ tp_bad = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred == 1 and gt == 1)
+ fp_bad = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred == 1 and gt != 1)
+ tn_bad = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred != 1 and gt != 1)
+ fn_bad = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred != 1 and gt == 1)
+
+ # True positives, false positives, etc. for score 5 (good videos)
+ tp_good = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred == 5 and gt == 5)
+ fp_good = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred == 5 and gt != 5)
+ tn_good = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred != 5 and gt != 5)
+ fn_good = sum(1 for pred, gt in zip(binary_predictions, binary_ground_truth) if pred != 5 and gt == 5)
+
+ # Precision, Recall, F1 for bad videos
+ precision_bad = tp_bad / (tp_bad + fp_bad) if (tp_bad + fp_bad) > 0 else 0
+ recall_bad = tp_bad / (tp_bad + fn_bad) if (tp_bad + fn_bad) > 0 else 0
+ f1_bad = 2 * precision_bad * recall_bad / (precision_bad + recall_bad) if (precision_bad + recall_bad) > 0 else 0
+
+ # Precision, Recall, F1 for good videos
+ precision_good = tp_good / (tp_good + fp_good) if (tp_good + fp_good) > 0 else 0
+ recall_good = tp_good / (tp_good + fn_good) if (tp_good + fn_good) > 0 else 0
+ f1_good = 2 * precision_good * recall_good / (precision_good + recall_good) if (precision_good + recall_good) > 0 else 0
+
+ return {
+ 'total_samples': len(predictions),
+ 'valid_predictions': len(valid_pairs),
+ 'failed_predictions': len(predictions) - len(valid_pairs),
+ 'exact_accuracy': exact_accuracy,
+ 'within_1_accuracy': within_1_accuracy,
+ 'mean_absolute_error': mae,
+ 'confusion_matrix': dict(confusion_matrix),
+ 'binary_metrics': {
+ 'bad_videos': {
+ 'precision': precision_bad,
+ 'recall': recall_bad,
+ 'f1_score': f1_bad,
+ 'true_positives': tp_bad,
+ 'false_positives': fp_bad,
+ 'true_negatives': tn_bad,
+ 'false_negatives': fn_bad,
+ },
+ 'good_videos': {
+ 'precision': precision_good,
+ 'recall': recall_good,
+ 'f1_score': f1_good,
+ 'true_positives': tp_good,
+ 'false_positives': fp_good,
+ 'true_negatives': tn_good,
+ 'false_negatives': fn_good,
+ }
+ },
+ 'score_distribution': {
+ 'predictions': dict(Counter(predictions_valid)),
+ 'ground_truth': dict(Counter(ground_truths_valid)),
+ }
+ }
+
+
+def generate_html_report(results, metrics, output_path):
+ """Generate HTML evaluation report."""
+ html = """
+
+
+
+ Fine-tuned Model Evaluation Report
+
+
+
+
+
Fine-tuned Model Evaluation Report
+
Cosmos Reason 1 - Transfer1 Dataset Evaluation
+"""
+
+ # Metrics section
+ html += """
+
Overall Metrics
+
+"""
+
+ if metrics:
+ html += f"""
+
+
Exact Accuracy
+
{metrics['exact_accuracy']:.1%}
+
+
+
Within ±1 Accuracy
+
{metrics['within_1_accuracy']:.1%}
+
+
+
Mean Absolute Error
+
{metrics['mean_absolute_error']:.2f}
+
+
+
Valid Predictions
+
{metrics['valid_predictions']}/{metrics['total_samples']}
+
+
+
+
Binary Classification Metrics
+
+
+
Bad Videos F1-Score
+
{metrics['binary_metrics']['bad_videos']['f1_score']:.1%}
+
+ Precision: {metrics['binary_metrics']['bad_videos']['precision']:.1%} |
+ Recall: {metrics['binary_metrics']['bad_videos']['recall']:.1%}
+
+
+
+
Good Videos F1-Score
+
{metrics['binary_metrics']['good_videos']['f1_score']:.1%}
+
+ Precision: {metrics['binary_metrics']['good_videos']['precision']:.1%} |
+ Recall: {metrics['binary_metrics']['good_videos']['recall']:.1%}
+
+
+
+"""
+
+ # Confusion matrix
+ if metrics and metrics.get('confusion_matrix'):
+ html += """
+
+
Confusion Matrix
+
+
+ | Ground Truth \\ Predicted |
+"""
+ # Get all scores
+ all_scores = sorted(set(list(metrics['confusion_matrix'].keys()) +
+ [pred for preds in metrics['confusion_matrix'].values() for pred in preds.keys()]))
+
+ for score in all_scores:
+ html += f"Score {score} | "
+ html += "
"
+
+ for gt_score in all_scores:
+ html += f"| Score {gt_score} | "
+ for pred_score in all_scores:
+ count = metrics['confusion_matrix'].get(gt_score, {}).get(pred_score, 0)
+ html += f"{count} | "
+ html += "
"
+
+ html += """
+
+
+"""
+
+ # Detailed results
+ html += """
+
Detailed Results
+"""
+
+ for i, result in enumerate(results[:50], 1): # Show first 50 results
+ video_name = Path(result['video_path']).name
+ pred_score = result['predicted_score']
+ gt_score = result['ground_truth']
+
+ if pred_score is None:
+ css_class = "failed"
+ status = "Failed to Parse"
+ elif pred_score == gt_score:
+ css_class = "correct"
+ status = "Correct"
+ else:
+ css_class = "incorrect"
+ status = "Incorrect"
+
+ html += f"""
+
+
{status} - Sample {i}
+
{video_name}
+
+ Ground Truth: {gt_score}
+ Predicted: {pred_score if pred_score else 'N/A'}
+
+
+ Show Response
+ {result['response']}
+
+
+"""
+
+ if len(results) > 50:
+ html += f'
... and {len(results) - 50} more results
'
+
+ html += """
+
+
+
+"""
+
+ with open(output_path, 'w') as f:
+ f.write(html)
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Evaluate fine-tuned model on evaluation dataset")
+ parser.add_argument("--model_path", type=str, required=True, help="Path to fine-tuned model checkpoint")
+ parser.add_argument("--eval_dataset", type=str, required=True, help="Path to evaluation dataset")
+ parser.add_argument("--prompt_path", type=str, default="prompts/video_reward.yaml", help="Path to prompt config")
+ parser.add_argument("--output_dir", type=str, default="eval_results", help="Output directory for results")
+ parser.add_argument("--batch_size", type=int, default=4, help="Batch size for inference")
+ args = parser.parse_args()
+
+ # Create output directory
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(exist_ok=True, parents=True)
+
+ print("=" * 80)
+ print("Fine-tuned Model Evaluation")
+ print("=" * 80)
+ print(f"Model: {args.model_path}")
+ print(f"Dataset: {args.eval_dataset}")
+ print(f"Output: {args.output_dir}")
+ print("=" * 80)
+ print()
+
+ # Load prompt configuration
+ print("Loading prompt configuration...")
+ system_prompt, user_prompt = load_prompt_config(args.prompt_path)
+
+ # Load evaluation dataset
+ print("Loading evaluation dataset...")
+ eval_dataset = load_from_disk(args.eval_dataset)
+ print(f" Loaded {len(eval_dataset)} samples")
+ print()
+
+ # Load fine-tuned model
+ print("Loading fine-tuned model...")
+ print(" (This may take a few minutes...)")
+ llm = LLM(
+ model=args.model_path,
+ limit_mm_per_prompt={"image": 0, "video": 1},
+ enforce_eager=True,
+ )
+ processor = AutoProcessor.from_pretrained(args.model_path)
+ print(" Model loaded successfully")
+ print()
+
+ # Run inference
+ print("Running inference...")
+ video_paths = [sample['video_url'] for sample in eval_dataset]
+ ground_truths = [sample['pc'] for sample in eval_dataset]
+
+ start_time = time.time()
+ inference_results = run_inference_batch(
+ llm, processor, video_paths, system_prompt, user_prompt,
+ batch_size=args.batch_size
+ )
+ elapsed_time = time.time() - start_time
+
+ print(f" Inference completed in {elapsed_time:.1f} seconds")
+ print(f" Average: {elapsed_time/len(eval_dataset):.2f} seconds/sample")
+ print()
+
+ # Combine results
+ results = []
+ predictions = []
+ for i, (sample, inference_result) in enumerate(zip(eval_dataset, inference_results)):
+ result = {
+ 'video_path': sample['video_url'],
+ 'ground_truth': sample['pc'],
+ 'predicted_score': inference_result['predicted_score'],
+ 'response': inference_result['response'],
+ 'caption': sample['caption']
+ }
+ results.append(result)
+ predictions.append(inference_result['predicted_score'])
+
+ # Calculate metrics
+ print("Calculating metrics...")
+ metrics = calculate_metrics(predictions, ground_truths)
+
+ if metrics:
+ print()
+ print("=" * 80)
+ print("EVALUATION RESULTS")
+ print("=" * 80)
+ print(f"Total Samples: {metrics['total_samples']}")
+ print(f"Valid Predictions: {metrics['valid_predictions']}")
+ print(f"Failed Predictions: {metrics['failed_predictions']}")
+ print(f"Exact Accuracy: {metrics['exact_accuracy']:.2%}")
+ print(f"Within ±1 Accuracy: {metrics['within_1_accuracy']:.2%}")
+ print(f"Mean Absolute Error: {metrics['mean_absolute_error']:.3f}")
+ print()
+ print("Binary Classification (Bad vs Good):")
+ print(f" Bad Videos F1: {metrics['binary_metrics']['bad_videos']['f1_score']:.2%}")
+ print(f" Good Videos F1: {metrics['binary_metrics']['good_videos']['f1_score']:.2%}")
+ print("=" * 80)
+
+ # Save results
+ print()
+ print("Saving results...")
+
+ # Save JSON
+ json_path = output_dir / "evaluation_results.json"
+ with open(json_path, 'w') as f:
+ json.dump({
+ 'metrics': metrics,
+ 'results': results,
+ 'config': {
+ 'model_path': args.model_path,
+ 'eval_dataset': args.eval_dataset,
+ 'num_samples': len(eval_dataset),
+ }
+ }, f, indent=2)
+ print(f" JSON: {json_path}")
+
+ # Generate HTML report
+ html_path = output_dir / "evaluation_report.html"
+ generate_html_report(results, metrics, html_path)
+ print(f" HTML: {html_path}")
+
+ print()
+ print("=" * 80)
+ print("Evaluation completed successfully!")
+ print(f"Results saved to: {args.output_dir}")
+ print(f"Open the HTML report: {html_path}")
+ print("=" * 80)
+
+
+if __name__ == "__main__":
+ main()
+
+