.github/workflows/docker/docker-compose.yaml (2 changes: 1 addition & 1 deletion)

@@ -8,7 +8,7 @@ services:
       - RAY_ADDRESS=auto
       - CHECKPOINT_ROOT_DIR=/mnt/checkpoints
       - DATA_ROOT_DIR=/mnt/data
-      - MODEL_PATH=/mnt/models/Qwen3-1.7B
+      - MODEL_PATH=/mnt/models/Qwen3-0.6B
       - CHECKPOINT_PATH=/mnt/checkpoints
     working_dir: /workspace
     networks:

docs/sphinx_doc/source/tutorial/trinity_configs.md (6 changes: 3 additions & 3 deletions)

@@ -129,14 +129,14 @@ Defines the model paths and token limits.
 model:
   model_path: /PATH/TO/MODEL/
   critic_model_path: ''
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 ```

 - `model_path`: Path to the model being trained.
 - `critic_model_path`: Optional path to a separate critic model. If empty, defaults to `model_path`.
-- `max_prompt_tokens`: Maximum number of tokens allowed in input prompts.
 - `max_response_tokens`: Maximum number of tokens allowed in generated responses.
+- `max_model_len`: Maximum number of tokens in a sequence.

 ---

@@ -444,7 +444,7 @@ actor_rollout_ref:
     # ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_model_len}
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

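Throughout this PR, the new `max_model_len` equals the removed `max_prompt_tokens` plus the retained `max_response_tokens` (here 4096 + 16384 = 20480). A minimal before/after sketch of the migration, using the values from this documentation page; the reading that the prompt budget is now implied by `max_model_len - max_response_tokens` is an inference from the arithmetic in this diff, not something the PR states:

```yaml
# Before: separate prompt and response budgets.
model:
  model_path: /PATH/TO/MODEL/
  max_prompt_tokens: 4096      # input budget
  max_response_tokens: 16384   # generation budget

# After: the total sequence length is explicit; the prompt budget is
# presumably max_model_len - max_response_tokens = 4096.
model:
  model_path: /PATH/TO/MODEL/
  max_response_tokens: 16384
  max_model_len: 20480         # 4096 + 16384
```
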
examples/async_gsm8k/explorer.yaml (2 changes: 1 addition & 1 deletion)

@@ -7,8 +7,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 4

examples/async_gsm8k/trainer.yaml (2 changes: 1 addition & 1 deletion)

@@ -7,8 +7,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 4

examples/async_gsm8k/verl_config.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

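The deleted trailing comment documented the old sizing rule for dynamic batching, `n * (max_prompt_length + max_response_length)`; the docs hunk above restates it as `n * ${data.max_model_len}`. As a rough sanity check with the async_gsm8k values (a sketch, assuming `n` here means the rollout repeat count, `repeat_times: 8`):

```yaml
# n * max_model_len = 8 * 1280 = 10240 tokens at most,
# so the configured per-GPU cap of 16384 leaves headroom.
ppo_max_token_len_per_gpu: 16384
```
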
examples/dapo_math/dapo.yaml (4 changes: 2 additions & 2 deletions)

@@ -3,6 +3,8 @@ name: dapo
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL/
+  max_response_tokens: 20480
+  max_model_len: 21504
 algorithm:
   algorithm_type: grpo
   repeat_times: 16

@@ -66,8 +68,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 1024
-    max_response_tokens: 20480
     seed: 42
 synchronizer:
   sync_method: 'nccl'

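dapo.yaml (and examples/grpo_math/math.yaml below) follows a slightly different shape: the token limits move out of the explorer section into the top-level `model` section, and `max_model_len` absorbs the old 1024-token prompt budget, since 1024 + 20480 = 21504.
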
examples/dapo_math/train_dapo.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 22000 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 22000
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/dpo_humanlike/dpo.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,8 +9,8 @@ algorithm:
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL
-  max_prompt_tokens: 512
   max_response_tokens: 1024
+  max_model_len: 1536
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/dpo_humanlike/train_dpo.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 2 # NOTE
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_alfworld/alfworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_alfworld/train_alfworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_gsm8k/gsm8k.yaml (3 changes: 1 addition & 2 deletions)

@@ -4,11 +4,10 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 algorithm:
   algorithm_type: grpo
   repeat_times: 8
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_gsm8k/train_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml (3 changes: 1 addition & 2 deletions)

@@ -24,11 +24,10 @@ data_processor:
     - stats_key: 'llm_quality_score'
       op_type: ADD
       weight: 1.0
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_gsm8k_experience_pipeline/train_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_gsm8k_task_pipeline/gsm8k.yaml (3 changes: 1 addition & 2 deletions)

@@ -26,11 +26,10 @@ data_processor:
   dj_process_desc: 'Please compute difficulty scores for these math questions.'
   agent_model_name: 'qwen-max'
   clean_strategy: 'iterative'
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_gsm8k_task_pipeline/train_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_math/math.yaml (4 changes: 2 additions & 2 deletions)

@@ -3,6 +3,8 @@ name: grpo_math_example
 checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 model:
   model_path: /PATH/TO/MODEL/
+  max_response_tokens: 3072
+  max_model_len: 4096
 algorithm:
   algorithm_type: grpo
   repeat_times: 8

@@ -44,8 +46,6 @@ explorer:
     enable_prefix_caching: false
     enforce_eager: true
     dtype: bfloat16
-    max_prompt_tokens: 1024
-    max_response_tokens: 3072
     seed: 42
 synchronizer:
   sync_method: 'nccl'

examples/grpo_math/train_math.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_sciworld/sciworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_sciworld/train_sciworld.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_toolcall/toolace.yaml (3 changes: 1 addition & 2 deletions)

@@ -4,11 +4,10 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
 algorithm:
   algorithm_type: grpo
   repeat_times: 8
-
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 8192
+  max_model_len: 12288
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/grpo_toolcall/train_toolace.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_webshop/train_webshop.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/grpo_webshop/webshop.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 4096
   max_response_tokens: 16384
+  max_model_len: 20480
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/mix_math/mix_math.yaml (2 changes: 1 addition & 1 deletion)

@@ -19,8 +19,8 @@ algorithm:
   read_batch_size_usual: 192
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 1024
   max_response_tokens: 10240
+  max_model_len: 11264
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/mix_math/train_mix_math.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 25600 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 25600
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/opmd_gsm8k/opmd_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 8
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 512
   max_response_tokens: 512
+  max_model_len: 1024
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/opmd_gsm8k/train_opmd_gsm8k.yaml (2 changes: 1 addition & 1 deletion)

@@ -33,7 +33,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/ppo_countdown/countdown.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   repeat_times: 5
 model:
   model_path: '/PATH/TO/MODEL/CHECKPOINT/'
-  max_prompt_tokens: 256
   max_response_tokens: 1024
+  max_model_len: 1280
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/ppo_countdown/train_countdown.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

examples/sft_mot/sft.yaml (2 changes: 1 addition & 1 deletion)

@@ -6,8 +6,8 @@ algorithm:
   algorithm_type: sft
 model:
   model_path: /PATH/TO/MODEL/
-  max_prompt_tokens: 512
   max_response_tokens: 10240
+  max_model_len: 10752
 cluster:
   node_num: 1
   gpu_per_node: 8

examples/sft_mot/train_sft.yaml (2 changes: 1 addition & 1 deletion)

@@ -9,7 +9,7 @@ actor_rollout_ref:
     strategy: fsdp # This is for backward-compatibility
     ppo_micro_batch_size_per_gpu: 4
     use_dynamic_bsz: True # False
-    ppo_max_token_len_per_gpu: 22000 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 22000
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False

pyproject.toml (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ requires-python = ">=3.10"
 dependencies = [
     "verl==0.4.1",
     "ray[default]>=2.45.0",
-    "vllm>=0.9.1,<=0.9.2",
+    "vllm>=0.9.1,<=0.10.0",
     "tensordict==0.6.2",
     "wandb",
     "omegaconf",

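This widens the accepted vLLM range rather than repinning it: the floor stays at 0.9.1, so existing environments remain valid, while `pip install "vllm>=0.9.1,<=0.10.0"` may now resolve to 0.10.0.
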
tests/template/config.yaml (2 changes: 1 addition & 1 deletion)

@@ -17,8 +17,8 @@ algorithm:

 model:
   model_path: ''
-  max_prompt_tokens: 2048
   max_response_tokens: 2048
+  max_model_len: 4096
 cluster: # 2 for explorer, 2 for trainer
   node_num: 2
   gpu_per_node: 2

tests/template/verl_config.yaml (2 changes: 1 addition & 1 deletion)

@@ -10,7 +10,7 @@ actor_rollout_ref:
     ppo_mini_batch_size: 4
     ppo_micro_batch_size_per_gpu: 1
     use_dynamic_bsz: True
-    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
+    ppo_max_token_len_per_gpu: 16384
     grad_clip: 1.0
     ppo_epochs: 1
     shuffle: False