82 changes: 82 additions & 0 deletions skyrl-agent/examples/run_tinker/tinker_osworld.sh
@@ -0,0 +1,82 @@
#!/bin/bash
# set -x

# =============================================================================
# Tinker RL Training for the OSWorld Task
# =============================================================================
# This script demonstrates how to train a model on OSWorld using:
# - GRPO (Group Relative Policy Optimization) for advantages
# - PPO loss for stable training
# - The osworld_action tool with multi-turn interactions
# =============================================================================

# Data paths
DATASET_FILE="/home/ubuntu/shuo/osworld/OSWorld_llm_agentsynth/osworld_train_8.parquet"

EVAL_DATASET_FILE="/home/ubuntu/shuo/osworld/OSWorld_llm_agentsynth/osworld_train_8.parquet"

# Output directory
NAME="${NAME:-jan03_qwen3_8b_osworld_tinker_lr4e_5_rank128}"
OUTPUT_DIR="/home/ubuntu/shuo/osworld/checkpoints/${NAME}"
Comment on lines +14 to +20
Severity: medium

The script contains hardcoded absolute paths for the datasets and the output directory, which include a specific user's home directory (/home/ubuntu/shuo). This makes the script non-portable and difficult for other users to run without modification.

It's a best practice to avoid hardcoding user-specific paths. You can make these paths configurable, for example, by using environment variables with sensible defaults, similar to how other variables like MODEL_NAME are handled in this script.

Suggested change
-DATASET_FILE="/home/ubuntu/shuo/osworld/OSWorld_llm_agentsynth/osworld_train_8.parquet"
-EVAL_DATASET_FILE="/home/ubuntu/shuo/osworld/OSWorld_llm_agentsynth/osworld_train_8.parquet"
-# Output directory
-NAME="${NAME:-jan03_qwen3_8b_osworld_tinker_lr4e_5_rank128}"
-OUTPUT_DIR="/home/ubuntu/shuo/osworld/checkpoints/${NAME}"
+DATASET_FILE="${DATASET_FILE:-/path/to/your/osworld_train_8.parquet}"
+EVAL_DATASET_FILE="${EVAL_DATASET_FILE:-/path/to/your/osworld_train_8.parquet}"
+# Output directory
+NAME="${NAME:-jan03_qwen3_8b_osworld_tinker_lr4e_5_rank128}"
+OUTPUT_DIR="${OUTPUT_DIR:-/path/to/your/checkpoints/${NAME}}"
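
With these defaults, the paths can then be overridden at invocation time without editing the script, for example (hypothetical paths):

DATASET_FILE=/data/osworld/osworld_train_8.parquet \
OUTPUT_DIR=/data/osworld/checkpoints \
bash examples/run_tinker/tinker_osworld.sh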

mkdir -p "$OUTPUT_DIR"

# Model configuration
MODEL_NAME="${MODEL_NAME:-Qwen/Qwen3-8B}"
LORA_RANK="${LORA_RANK:-128}"

# Training hyperparameters
BATCH_SIZE="${BATCH_SIZE:-8}"
LEARNING_RATE="${LEARNING_RATE:-4e-5}"
MAX_STEPS="${MAX_STEPS:-50}"
SAVE_EVERY="${SAVE_EVERY:-5}"
EVAL_EVERY="${EVAL_EVERY:-10}"

# RL configuration
LOSS_FN="${LOSS_FN:-ppo}"
GROUP_SIZE="${GROUP_SIZE:-8}" # Should match num_trajectories in YAML
NORMALIZE_ADVANTAGES="${NORMALIZE_ADVANTAGES:-false}"
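# Background (a sketch of the GRPO idea, not taken from the trainer code):
# each prompt is rolled out GROUP_SIZE times and every reward is scored
# against its own group, roughly A_i = (r_i - mean(r_group)) / std(r_group);
# NORMALIZE_ADVANTAGES presumably controls the std-division step.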

# Logging
WANDB_PROJECT="${WANDB_PROJECT:-tinker-osw}"
WANDB_NAME="${WANDB_NAME:-${NAME}}"

# Task configuration
TASK_YAML="./examples/run_tinker/tinker_osworld.yaml"

echo "================================================"
echo "Tinker RL Training Configuration - OSWorld"
echo "================================================"
echo "Model: $MODEL_NAME"
echo "Dataset: $DATASET_FILE"
echo "Task YAML: $TASK_YAML"
echo "Batch Size: $BATCH_SIZE"
echo "Group Size (GRPO): $GROUP_SIZE"
echo "Max Steps: $MAX_STEPS"
echo "Output: $OUTPUT_DIR"
echo "================================================"

# Run training
# UV_NO_SYNC=1 prevents uv from trying to (re)install dependencies (like vllm);
# make sure required deps are already installed in the active env.
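# (e.g., install them once with: uv sync --extra tinker)
# The LD_PRELOAD below points at the aarch64 libgomp; adjust the path on
# other architectures. Extra key=value overrides appended to this script
# are forwarded to the trainer via "$@", e.g.:
#   bash examples/run_tinker/tinker_osworld.sh max_steps=100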
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 UV_NO_SYNC=1 uv run --active --extra tinker --env-file .env -m skyrl_agent.integrations.tinker.tinker_train \
model_name="$MODEL_NAME" \
skyrl_agent_task_yaml="$TASK_YAML" \
dataset_file="$DATASET_FILE" \
eval_dataset_file="$EVAL_DATASET_FILE" \
batch_size="$BATCH_SIZE" \
learning_rate="$LEARNING_RATE" \
lora_rank="$LORA_RANK" \
max_steps="$MAX_STEPS" \
save_every="$SAVE_EVERY" \
loss_fn="$LOSS_FN" \
group_size="$GROUP_SIZE" \
normalize_advantages="$NORMALIZE_ADVANTAGES" \
wandb_project="$WANDB_PROJECT" \
wandb_name="$WANDB_NAME" \
log_dir="$OUTPUT_DIR" \
"$@"

echo "================================================"
echo "Training completed!"
echo "Checkpoints saved to: ${OUTPUT_DIR}/${WANDB_NAME}_*"
echo "================================================"
39 changes: 39 additions & 0 deletions skyrl-agent/examples/run_tinker/tinker_osworld.yaml
@@ -0,0 +1,39 @@
agent_cls: skyrl_agent.agents.react.ReActAgent

task: skyrl_agent.tasks.osworld.osworld_task.OSWorldTask

tools: ["finish", "osworld_action"]

data:
  instance_key: instance
  data_source_key: data_source

generator:
  infer_backend: tinker
  use_cpu_node: false
  backend_config: null
  num_trajectories: 8  # must match num_trajectories in the verl config
  max_iterations: 15
  max_prompt_length: 15000
  sampling_params:
    temperature: 1.0
    top_p: 0.95
    max_tokens: 3000
  val_config:
    num_trajectories: 1
    sampling_params:
      temperature: 0.6
      top_p: 0.95
      max_tokens: 3000
  remove_think_tokens: false
  vision_is_active: false
  qwen3_enable_thinking: true
  qwen3_acc_thinking: false
  history_length: 3
  path_to_vm: "/path/to/your/vm.qcow2"

dispatcher:
  type: async_fix_pool
  scheduler: naive
  max_parallel_agents: 8
  max_eval_parallel_agents: 8
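
Since GROUP_SIZE in tinker_osworld.sh must stay in sync with generator.num_trajectories here (both files carry a comment to that effect), a pre-flight check in the script can catch drift early; a minimal sketch, assuming yq v4 is available:

have="$(yq '.generator.num_trajectories' "$TASK_YAML")"
if [ "$have" != "$GROUP_SIZE" ]; then
  echo "GROUP_SIZE ($GROUP_SIZE) != generator.num_trajectories ($have)" >&2
  exit 1
fi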