.github/workflows/docker/docker-compose.yaml (2 changes: 1 addition & 1 deletion)

@@ -8,7 +8,7 @@ services:
       - RAY_ADDRESS=auto
       - CHECKPOINT_ROOT_DIR=/mnt/checkpoints
       - DATA_ROOT_DIR=/mnt/data
-      - MODEL_PATH=/mnt/models/Qwen3-1.7B
+      - MODEL_PATH=/mnt/models/Qwen3-0.6B
       - CHECKPOINT_PATH=/mnt/checkpoints
     working_dir: /workspace
     networks:

docs/sphinx_doc/source/tutorial/trinity_configs.md (6 changes: 3 additions & 3 deletions)

@@ -129,14 +129,14 @@ Defines the model paths and token limits.
 model:
   model_path: /PATH/TO/MODEL/
   critic_model_path: ''
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 ```

 - `model_path`: Path to the model being trained.
 - `critic_model_path`: Optional path to a separate critic model. If empty, defaults to `model_path`.
-- `max_prompt_tokens`: Maximum number of tokens allowed in input prompts.
 - `max_response_tokens`: Maximum number of tokens allowed in generated responses.
+- `max_model_len`: Maximum number of tokens in a sequence.

 ---

@@ -444,7 +444,7 @@ actor_rollout_ref:
     # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_model_len}
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

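Throughout this PR, the new `max_model_len` equals the removed `max_prompt_tokens` plus the retained `max_response_tokens` (here 4096 + 16384 = 20480). A minimal before/after sketch of the migration, using the values from this documentation page; the reading that the prompt budget is now implied by `max_model_len - max_response_tokens` is an inference from the arithmetic in this diff, not something the PR states:

```yaml
# Before: separate prompt and response budgets.
model:
  model_path: /PATH/TO/MODEL/
  max_prompt_tokens: 4096      # input budget
  max_response_tokens: 16384   # generation budget

# After: the total sequence length is explicit; the prompt budget is
# presumably max_model_len - max_response_tokens = 4096.
model:
  model_path: /PATH/TO/MODEL/
  max_response_tokens: 16384
  max_model_len: 20480         # 4096 + 16384
```
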
examples/async_gsm8k/explorer.yaml (2 changes: 1 addition & 1 deletion)

@@ -7,8 +7,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 4

examples/async_gsm8k/trainer.yaml (2 changes: 1 addition & 1 deletion)

@@ -7,8 +7,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 4

examples/async_gsm8k/verl_config.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

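The deleted trailing comment documented the old sizing rule for dynamic batching, `n * (max_prompt_length + max_response_length)`; the docs hunk above restates it as `n * ${data.max_model_len}`. As a rough sanity check with the async_gsm8k values (a sketch, assuming `n` here means the rollout repeat count, `repeat_times: 8`):

```yaml
# n * max_model_len = 8 * 1280 = 10240 tokens at most,
# so the configured per-GPU cap of 16384 leaves headroom.
ppo_max_token_len_per_gpu: 16384
```
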
examples/dapo_math/dapo.yaml (4 changes: 2 additions & 2 deletions)

@@ -3,6 +3,8 @@ name: dapo
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL/
+  max_response_tokens: 20480
+  max_model_len: 21504
 algorithm:
   algorithm_type: grpo
   repeat_times: 16

@@ -66,8 +68,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 1024
-    max_response_tokens: 20480
     seed: 42
 synchronizer:
   sync_method: 'nccl'

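dapo.yaml (and examples/grpo_math/math.yaml below) follows a slightly different shape: the token limits move out of the explorer section into the top-level `model` section, and `max_model_len` absorbs the old 1024-token prompt budget, since 1024 + 20480 = 21504.
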
examples/dapo_math/train_dapo.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 22000 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 22000
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/dpo_humanlike/dpo.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,8 +9,8 @@ algorithm:
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL
-  max_prompt_tokens: 512
   max_response_tokens: 1024
+  max_model_len: 1536
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/dpo_humanlike/train_dpo.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 2 # NOTE
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_alfworld/alfworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_alfworld/train_alfworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_gsm8k/gsm8k.yaml (3 changes: 1 addition & 2 deletions)

@@ -4,11 +4,10 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 algorithm:
   algorithm_type: grpo
   repeat_times: 8
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_gsm8k/train_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml (3 changes: 1 addition & 2 deletions)

@@ -24,11 +24,10 @@ data_processor:
     - stats_key: 'llm_quality_score'
       op_type: ADD
       weight: 1.0
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_gsm8k_experience_pipeline/train_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_gsm8k_task_pipeline/gsm8k.yaml (3 changes: 1 addition & 2 deletions)

@@ -26,11 +26,10 @@ data_processor:
   dj_process_desc: 'Please compute difficulty scores for these math questions.'
   agent_model_name: 'qwen-max'
   clean_strategy: 'iterative'
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_gsm8k_task_pipeline/train_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_math/math.yaml (4 changes: 2 additions & 2 deletions)

@@ -3,6 +3,8 @@ name: grpo_math_example
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL/
+  max_response_tokens: 3072
+  max_model_len: 4096
 algorithm:
   algorithm_type: grpo
   repeat_times: 8

@@ -44,8 +46,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 1024
-    max_response_tokens: 3072
     seed: 42
 synchronizer:
   sync_method: 'nccl'

examples/grpo_math/train_math.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_sciworld/sciworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_sciworld/train_sciworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_toolcall/toolace.yaml (3 changes: 1 addition & 2 deletions)

@@ -4,11 +4,10 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 algorithm:
   algorithm_type: grpo
   repeat_times: 8
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 8192
+  max_model_len: 12288
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_toolcall/train_toolace.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_webshop/train_webshop.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_webshop/webshop.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/mix_math/mix_math.yaml (2 changes: 1 addition & 1 deletion)

@@ -19,8 +19,8 @@ algorithm:
   read_batch_size_usual: 192
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 1024
   max_response_tokens: 10240
+  max_model_len: 11264
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/mix_math/train_mix_math.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 25600 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 25600
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/opmd_gsm8k/opmd_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 512
   max_response_tokens: 512
+  max_model_len: 1024
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/opmd_gsm8k/train_opmd_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -33,7 +33,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/ppo_countdown/countdown.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 5
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/ppo_countdown/train_countdown.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/sft_mot/sft.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   algorithm_type: sft
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 512
   max_response_tokens: 10240
+  max_model_len: 10752
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/sft_mot/train_sft.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 22000 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 22000
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ requires-python = ">=3.10"
 dependencies = [
     "verl==0.4.1",
     "ray[default]>=2.45.0",
-    "vllm>=0.9.1,<=0.9.2",
+    "vllm>=0.9.1,<=0.10.0",
     "tensordict==0.6.2",
     "wandb",
     "omegaconf",

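This widens the accepted vLLM range rather than repinning it: the floor stays at 0.9.1, so existing environments remain valid, while `pip install "vllm>=0.9.1,<=0.10.0"` may now resolve to 0.10.0.
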
tests/template/config.yaml (2 changes: 1 addition & 1 deletion)

@@ -17,8 +17,8 @@ algorithm:

 model:
   model_path: ''
-  max_prompt_tokens: 2048
   max_response_tokens: 2048
+  max_model_len: 4096
 cluster: # 2 for explorer, 2 for trainer
   node_num: 2
   gpu_per_node: 2

tests/template/verl_config.yaml (2 changes: 1 addition & 1 deletion)

@@ -10,7 +10,7 @@ actor_rollout_ref:
     ppo_mini_batch_size: 4
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False