diff --git a/benchmark/config/countdown-template.yaml b/benchmark/config/countdown-template.yaml index edd231cc4e..d10eb33573 100644 --- a/benchmark/config/countdown-template.yaml +++ b/benchmark/config/countdown-template.yaml @@ -35,11 +35,9 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 + default_workflow_type: math_workflow + default_reward_fn_type: countdown_reward eval_tasksets: [] - default_workflow_type: math_workflow - default_reward_fn_type: countdown_reward - system_prompt: null - reply_prefix: null trainer_input: experience_buffer: name: experience_buffer diff --git a/benchmark/config/gsm8k-template.yaml b/benchmark/config/gsm8k-template.yaml index 7a76de2e8a..93f42166a6 100644 --- a/benchmark/config/gsm8k-template.yaml +++ b/benchmark/config/gsm8k-template.yaml @@ -40,11 +40,9 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 + default_workflow_type: math_workflow + default_reward_fn_type: math_reward eval_tasksets: [] - default_workflow_type: math_workflow - default_reward_fn_type: math_reward - system_prompt: null - reply_prefix: null trainer_input: experience_buffer: name: experience_buffer @@ -79,7 +77,7 @@ trainer: enable_preview: true grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 10240 + max_token_len_per_gpu: 10240 ulysses_sequence_parallel_size: 1 monitor: monitor_type: wandb diff --git a/docs/sphinx_doc/source/tutorial/example_async_mode.md b/docs/sphinx_doc/source/tutorial/example_async_mode.md index 7cf42f3621..64e692fb86 100644 --- a/docs/sphinx_doc/source/tutorial/example_async_mode.md +++ b/docs/sphinx_doc/source/tutorial/example_async_mode.md @@ -39,14 +39,14 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer storage_type: queue path: 'sqlite:///gsm8k.db' explorer: - runner_num: 32 + runner_per_model: 8 rollout_model: engine_num: 4 synchronizer: @@ -86,7 +86,7 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer @@ -98,7 +98,7 @@ synchronizer: trainer: grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 ``` @@ -133,7 +133,7 @@ cluster: # important gpu_per_node: 8 explorer: name: 'explorer_new' # important - runner_num: 64 + runner_per_model: 8 rollout_model: engine_num: 8 buffer: @@ -150,7 +150,7 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer diff --git a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md index d20ed8bddc..cfa5d078d2 100644 --- a/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md +++ b/docs/sphinx_doc/source/tutorial/example_reasoning_basic.md @@ -77,6 +77,7 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 + default_workflow_type: 'math_workflow' eval_tasksets: - name: gsm8k-eval storage_type: file @@ -86,7 +87,7 @@ buffer: format: prompt_key: 'question' response_key: 'answer' - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer @@ -94,7 +95,7 @@ buffer: path: 'sqlite:///gsm8k.db' explorer: eval_interval: 50 - 
runner_num: 16 + runner_per_model: 16 rollout_model: engine_num: 1 synchronizer: @@ -117,7 +118,7 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml ## Optional: RFT with SFT Warmup -Before RFT, we may use SFT as a warmup step. Trinity-RFT supports adding SFT warmup stage before RFT by setting `stages` in the config file. The `sft_warmup_dataset` specifies the dataset used for SFT warmup, and `sft_warmup_steps` specifies the number of training steps for SFT warmup. +Before RFT, we may use SFT as a warmup step. Trinity-RFT supports adding SFT warmup stage before RFT by setting `stages` in the config file. The `experience_buffer` specifies the dataset used for SFT warmup, and `total_steps` specifies the number of training steps for SFT warmup. ```yaml # Properly add the following configs in gsm8k.yaml diff --git a/docs/sphinx_doc/source/tutorial/example_step_wise.md b/docs/sphinx_doc/source/tutorial/example_step_wise.md index 9c073168e4..3239bbaf70 100644 --- a/docs/sphinx_doc/source/tutorial/example_step_wise.md +++ b/docs/sphinx_doc/source/tutorial/example_step_wise.md @@ -121,7 +121,7 @@ buffer: workflow_args: max_env_steps: 30 enable_progress_bar: false - default_workflow_type: 'step_wise_alfworld_workflow' + default_workflow_type: 'step_wise_alfworld_workflow' trainer_input: experience_buffer: name: alfworld_buffer @@ -129,7 +129,7 @@ buffer: use_priority_queue: true explorer: max_repeat_times_per_runner: 1 - runner_num: 32 + runner_per_model: 32 max_timeout: 3600 rollout_model: enable_history: true @@ -152,7 +152,7 @@ trainer: save_interval: 50 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 ``` diff --git a/docs/sphinx_doc/source/tutorial/trinity_configs.md b/docs/sphinx_doc/source/tutorial/trinity_configs.md index feec65a947..c1af4871b7 100644 --- a/docs/sphinx_doc/source/tutorial/trinity_configs.md +++ b/docs/sphinx_doc/source/tutorial/trinity_configs.md @@ -200,6 +200,7 @@ buffer: batch_size: 32 train_batch_size: 256 total_epochs: 100 + total_steps: null explorer_input: taskset: @@ -214,9 +215,6 @@ buffer: ... buffer_2: ... - - default_workflow_type: 'math_workflow' - default_reward_fn_type: 'countdown_reward' ``` - `batch_size`: Number of tasks used per training step. *Please do not multiply this value by the `algorithm.repeat_times` manually*. @@ -231,6 +229,9 @@ Defines the dataset(s) used by the explorer for training and evaluation. ```yaml buffer: explorer_input: + default_workflow_type: 'math_workflow' + default_eval_workflow_type: 'math_workflow' + default_reward_fn_type: 'countdown_reward' taskset: name: countdown_train storage_type: file @@ -262,7 +263,10 @@ buffer: ``` - `buffer.explorer_input.taskset`: Task dataset used for training exploration policies. -- `buffer.explorer_input.eval_taskset`: List of task datasets used for evaluation. +- `buffer.explorer_input.eval_tasksets`: List of task datasets used for evaluation. +- `buffer.explorer_input.default_workflow_type`: Default workflow type for all task datasets under `explorer_input` if not specified at the dataset level. +- `buffer.explorer_input.default_eval_workflow_type`: Default evaluation workflow type for all eval task datasets under `explorer_input` if not specified at the dataset level. +- `buffer.explorer_input.default_reward_fn_type`: Default reward function type for all task datasets under `explorer_input` if not specified at the dataset level. 
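As a rough, abridged sketch of how these `explorer_input`-level defaults resolve (dataset names below are only illustrative, and unrelated fields are omitted): a value set at the dataset level always wins; otherwise eval task datasets fall back to `default_eval_workflow_type` first and then `default_workflow_type`, while the training taskset falls back to `default_workflow_type`; `default_reward_fn_type` fills in the reward function in the same way.

```yaml
buffer:
  explorer_input:
    default_workflow_type: 'math_workflow'
    default_eval_workflow_type: 'math_boxed_workflow'
    default_reward_fn_type: 'countdown_reward'
    taskset:
      name: countdown_train
      # no workflow set here -> resolves to default_workflow_type: 'math_workflow'
      # no reward set here   -> resolves to default_reward_fn_type: 'countdown_reward'
    eval_tasksets:
      - name: gsm8k-eval
        default_workflow_type: 'math_workflow'   # dataset-level value wins
      - name: countdown-eval
        # no workflow set here -> resolves to default_eval_workflow_type: 'math_boxed_workflow'
```

In this sketch, `gsm8k-eval` keeps its own workflow, `countdown-eval` picks up `math_boxed_workflow`, and `countdown_train` picks up `math_workflow`.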
The configuration for each task dataset is defined as follows: @@ -413,7 +417,7 @@ trainer: save_strategy: "unrestricted" grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: null ``` @@ -429,7 +433,7 @@ trainer: - `unrestricted`: No restrictions on saving operations; multiple nodes, processes, or threads are allowed to save the model simultaneously. - `grad_clip`: Gradient clipping for updates. - `use_dynamic_bsz`: Whether to use dynamic batch size. -- `ppo_max_token_len_per_gpu`: The maximum number of tokens to be processed in forward and backward when updating the policy. Effective when `use_dynamic_bsz=true`. +- `max_token_len_per_gpu`: The maximum number of tokens to be processed in forward and backward when updating the policy. Effective when `use_dynamic_bsz=true`. - `ulysses_sequence_parallel_size`: Sequence parallel size. - `trainer_config`: The trainer configuration provided inline. --- diff --git a/docs/sphinx_doc/source_zh/tutorial/example_async_mode.md b/docs/sphinx_doc/source_zh/tutorial/example_async_mode.md index 7f1cb27301..00139883f3 100644 --- a/docs/sphinx_doc/source_zh/tutorial/example_async_mode.md +++ b/docs/sphinx_doc/source_zh/tutorial/example_async_mode.md @@ -39,14 +39,14 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer storage_type: queue path: 'sqlite:///gsm8k.db' explorer: - runner_num: 32 + runner_per_model: 16 rollout_model: engine_num: 4 synchronizer: @@ -86,7 +86,7 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer @@ -133,7 +133,7 @@ cluster: # important gpu_per_node: 8 explorer: name: 'explorer_new' # important - runner_num: 64 + runner_per_model: 8 rollout_model: engine_num: 8 buffer: @@ -150,7 +150,7 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer diff --git a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md index 639d07d89f..3235930063 100644 --- a/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md +++ b/docs/sphinx_doc/source_zh/tutorial/example_reasoning_basic.md @@ -77,6 +77,7 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 + default_workflow_type: 'math_workflow' eval_tasksets: - name: gsm8k-eval storage_type: file @@ -86,7 +87,7 @@ buffer: format: prompt_key: 'question' response_key: 'answer' - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer @@ -94,7 +95,7 @@ buffer: path: 'sqlite:///gsm8k.db' explorer: eval_interval: 50 - runner_num: 16 + runner_per_model: 16 rollout_model: engine_num: 1 synchronizer: @@ -117,7 +118,7 @@ trinity run --config examples/grpo_gsm8k/gsm8k.yaml ## 进阶选项:带 SFT warmup 的 RFT -在进行 RFT 之前,我们可以先使用 SFT 作为预热步骤。Trinity-RFT 支持通过在配置文件中设置 `stages` 来添加 SFT 预热阶段。`sft_warmup_dataset` 指定用于 SFT warmup 的数据集,`sft_warmup_steps` 指定 SFT warmup 的训练步数。 +在进行 RFT 之前,我们可以先使用 SFT 作为预热步骤。Trinity-RFT 支持通过在配置文件中设置 `stages` 来添加 SFT 预热阶段。`experience_buffer` 指定用于 SFT warmup 的数据集,`total_steps` 指定 SFT warmup 的训练步数。 ```yaml # 在 
gsm8k.yaml 中正确添加以下配置 diff --git a/docs/sphinx_doc/source_zh/tutorial/example_step_wise.md b/docs/sphinx_doc/source_zh/tutorial/example_step_wise.md index a0cc1b231e..f40250f5e3 100644 --- a/docs/sphinx_doc/source_zh/tutorial/example_step_wise.md +++ b/docs/sphinx_doc/source_zh/tutorial/example_step_wise.md @@ -119,7 +119,7 @@ buffer: workflow_args: max_env_steps: 30 enable_progress_bar: false - default_workflow_type: 'step_wise_alfworld_workflow' + default_workflow_type: 'step_wise_alfworld_workflow' trainer_input: experience_buffer: name: alfworld_buffer @@ -127,7 +127,7 @@ buffer: use_priority_queue: true explorer: max_repeat_times_per_runner: 1 - runner_num: 32 + runner_per_model: 16 max_timeout: 3600 rollout_model: enable_history: true @@ -150,7 +150,7 @@ trainer: save_interval: 50 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 ``` diff --git a/docs/sphinx_doc/source_zh/tutorial/trinity_configs.md b/docs/sphinx_doc/source_zh/tutorial/trinity_configs.md index b2b2dde92b..998fe939e8 100644 --- a/docs/sphinx_doc/source_zh/tutorial/trinity_configs.md +++ b/docs/sphinx_doc/source_zh/tutorial/trinity_configs.md @@ -214,9 +214,6 @@ buffer: ... buffer_2: ... - - default_workflow_type: 'math_workflow' - default_reward_fn_type: 'countdown_reward' ``` - `batch_size`: 每个训练步骤使用的任务数。*请勿手动将此值乘以 `algorithm.repeat_times`*。 @@ -231,6 +228,9 @@ buffer: ```yaml buffer: explorer_input: + default_workflow_type: 'math_workflow' + default_eval_workflow_type: 'math_workflow' + default_reward_fn_type: 'countdown_reward' taskset: name: countdown_train storage_type: file @@ -256,13 +256,14 @@ buffer: response_key: 'answer' rollout_args: temperature: 0.1 - default_workflow_type: 'math_workflow' - default_reward_fn_type: 'countdown_reward' ... 
``` - `buffer.explorer_input.taskset`: 用于训练探索策略的任务数据集。 -- `buffer.explorer_input.eval_taskset`: 用于评估的任务数据集列表。 +- `buffer.explorer_input.eval_tasksets`: 用于评测的任务数据集列表。 +- `buffer.explorer_input.default_workflow_type`: 若未在数据集级别指定,则为所有任务数据集设置默认的工作流类型。 +- `buffer.explorer_input.default_eval_workflow_type`: 若未在数据集级别指定,则为所有评测任务数据集设置默认的工作流类型。 +- `buffer.explorer_input.default_reward_fn_type`: 若未在数据集级别指定,则为所有任务数据集设置默认的奖励类型。 每个任务数据集的配置定义如下: @@ -413,7 +414,7 @@ trainer: save_strategy: "unrestricted" grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: null ``` @@ -429,7 +430,7 @@ trainer: - `unrestricted`:不限制保存操作,允许多个节点、进程或线程同时保存模型。 - `grad_clip`: 梯度裁剪阈值。 - `use_dynamic_bsz`: 是否使用动态批量大小。 -- `ppo_max_token_len_per_gpu`: 训练过程中,每个 GPU 最大 token 长度; 当 `use_dynamic_bsz=true` 时生效。 +- `max_token_len_per_gpu`: 训练过程中,每个 GPU 最大 token 长度; 当 `use_dynamic_bsz=true` 时生效。 - `ulysses_sequence_parallel_size`: 序列并行的并行度,即用于分割单个序列的 GPU 数量。 - `trainer_config`: 内联提供的 trainer 配置。 diff --git a/examples/RAFT_alfworld/RAFT_alfworld_7B.yaml b/examples/RAFT_alfworld/RAFT_alfworld_7B.yaml index 646de28a0a..7fc2445eaf 100644 --- a/examples/RAFT_alfworld/RAFT_alfworld_7B.yaml +++ b/examples/RAFT_alfworld/RAFT_alfworld_7B.yaml @@ -70,7 +70,7 @@ trainer: save_interval: 100000 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 20000 # Adjusted for alfworld longer sequences + max_token_len_per_gpu: 20000 # Adjusted for alfworld longer sequences ulysses_sequence_parallel_size: 1 monitor: monitor_type: wandb diff --git a/examples/RAFT_alfworld/RAFT_reflect_alfworld_7B.yaml b/examples/RAFT_alfworld/RAFT_reflect_alfworld_7B.yaml index 1a98e4f58e..30a115cada 100644 --- a/examples/RAFT_alfworld/RAFT_reflect_alfworld_7B.yaml +++ b/examples/RAFT_alfworld/RAFT_reflect_alfworld_7B.yaml @@ -70,7 +70,7 @@ trainer: save_interval: 100000 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 20000 # Adjusted for alfworld longer sequences + max_token_len_per_gpu: 20000 # Adjusted for alfworld longer sequences ulysses_sequence_parallel_size: 1 monitor: monitor_type: wandb diff --git a/examples/agentscope_react/gsm8k.yaml b/examples/agentscope_react/gsm8k.yaml index 1cb61b3b89..a70da2c5e1 100644 --- a/examples/agentscope_react/gsm8k.yaml +++ b/examples/agentscope_react/gsm8k.yaml @@ -29,8 +29,8 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 + default_workflow_type: 'as_react_workflow' eval_tasksets: [] - default_workflow_type: 'as_react_workflow' trainer_input: experience_buffer: name: agentscope_gsm8k_buffer @@ -61,7 +61,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 24576 + max_token_len_per_gpu: 24576 ulysses_sequence_parallel_size: 2 monitor: monitor_type: tensorboard diff --git a/examples/agentscope_tool_react/agentscopev0_tool_react_dapo.yaml b/examples/agentscope_tool_react/agentscopev0_tool_react_dapo.yaml index 23efc27c12..9463de670b 100644 --- a/examples/agentscope_tool_react/agentscopev0_tool_react_dapo.yaml +++ b/examples/agentscope_tool_react/agentscopev0_tool_react_dapo.yaml @@ -29,8 +29,8 @@ buffer: response_key: 'solution' rollout_args: temperature: 1.0 + default_workflow_type: 'agentscopev0_react_math_workflow' eval_tasksets: [] - default_workflow_type: 'agentscopev0_react_math_workflow' trainer_input: experience_buffer: name: agentscope_dapo_buffer @@ -62,7 +62,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - 
ppo_max_token_len_per_gpu: 24576 + max_token_len_per_gpu: 24576 ulysses_sequence_parallel_size: 2 monitor: monitor_type: wandb diff --git a/examples/agentscope_tool_react/agentscopev0_tool_react_gsm8k.yaml b/examples/agentscope_tool_react/agentscopev0_tool_react_gsm8k.yaml index 76c946edfe..e0f73db05b 100644 --- a/examples/agentscope_tool_react/agentscopev0_tool_react_gsm8k.yaml +++ b/examples/agentscope_tool_react/agentscopev0_tool_react_gsm8k.yaml @@ -29,8 +29,8 @@ buffer: response_key: 'answer' rollout_args: temperature: 1.0 + default_workflow_type: 'agentscopev0_react_math_workflow' eval_tasksets: [] - default_workflow_type: 'agentscopev0_react_math_workflow' trainer_input: experience_buffer: name: agentscope_gsm8k_buffer @@ -62,7 +62,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 24576 + max_token_len_per_gpu: 24576 ulysses_sequence_parallel_size: 2 monitor: monitor_type: wandb diff --git a/examples/agentscope_tool_react/agentscopev1_tool_react_dapo.yaml b/examples/agentscope_tool_react/agentscopev1_tool_react_dapo.yaml index e0bff08b66..3a951a66d4 100644 --- a/examples/agentscope_tool_react/agentscopev1_tool_react_dapo.yaml +++ b/examples/agentscope_tool_react/agentscopev1_tool_react_dapo.yaml @@ -29,8 +29,8 @@ buffer: response_key: 'solution' rollout_args: temperature: 1.0 + default_workflow_type: 'agentscope_react_math_workflow' eval_tasksets: [] - default_workflow_type: 'agentscope_react_math_workflow' trainer_input: experience_buffer: name: agentscope_dapo_buffer @@ -60,7 +60,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 24576 + max_token_len_per_gpu: 24576 ulysses_sequence_parallel_size: 2 monitor: monitor_type: wandb diff --git a/examples/agentscope_websearch/agentscopev1_websearch_agent.yaml b/examples/agentscope_websearch/agentscopev1_websearch_agent.yaml index bb6d9b34fe..cd9a87594a 100644 --- a/examples/agentscope_websearch/agentscopev1_websearch_agent.yaml +++ b/examples/agentscope_websearch/agentscopev1_websearch_agent.yaml @@ -33,6 +33,7 @@ buffer: temperature: 1.0 max_tokens: 4096 enable_progress_bar: false + default_workflow_type: 'agentscope_v1_react_search_workflow' eval_tasksets: - name: webqa_test storage_type: file @@ -47,7 +48,7 @@ buffer: rollout_args: temperature: 0.6 max_tokens: 4096 - default_workflow_type: 'agentscope_v1_react_search_workflow' + default_workflow_type: 'agentscope_v1_react_search_workflow' trainer_input: experience_buffer: name: experience_buffer @@ -88,5 +89,5 @@ trainer: save_interval: 20 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 2 diff --git a/examples/asymre_gsm8k/gsm8k.yaml b/examples/asymre_gsm8k/gsm8k.yaml index 16ac2b31bb..0374dd6fcb 100644 --- a/examples/asymre_gsm8k/gsm8k.yaml +++ b/examples/asymre_gsm8k/gsm8k.yaml @@ -36,6 +36,7 @@ buffer: response_key: answer rollout_args: temperature: 1.0 + default_workflow_type: math_workflow eval_tasksets: - name: gsm8k-eval storage_type: file @@ -45,7 +46,7 @@ buffer: format: prompt_key: question response_key: answer - default_workflow_type: math_workflow + default_workflow_type: math_workflow trainer_input: experience_buffer: name: gsm8k_buffer @@ -69,5 +70,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/asymre_math/math.yaml 
b/examples/asymre_math/math.yaml index 97df16cebb..c1636f5318 100644 --- a/examples/asymre_math/math.yaml +++ b/examples/asymre_math/math.yaml @@ -73,5 +73,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/async_gsm8k/explorer.yaml b/examples/async_gsm8k/explorer.yaml index 07c21ef043..ab49d2ffab 100644 --- a/examples/async_gsm8k/explorer.yaml +++ b/examples/async_gsm8k/explorer.yaml @@ -28,7 +28,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer diff --git a/examples/async_gsm8k/trainer.yaml b/examples/async_gsm8k/trainer.yaml index 05430aebe6..9791eeeeec 100644 --- a/examples/async_gsm8k/trainer.yaml +++ b/examples/async_gsm8k/trainer.yaml @@ -30,7 +30,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer @@ -42,5 +42,5 @@ synchronizer: trainer: grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/cispo_gsm8k/gsm8k.yaml b/examples/cispo_gsm8k/gsm8k.yaml index 2a038dbeec..6d6dffaaa2 100644 --- a/examples/cispo_gsm8k/gsm8k.yaml +++ b/examples/cispo_gsm8k/gsm8k.yaml @@ -61,5 +61,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/dapo_math/dapo.yaml b/examples/dapo_math/dapo.yaml index 07df6557d7..3acf4d817e 100644 --- a/examples/dapo_math/dapo.yaml +++ b/examples/dapo_math/dapo.yaml @@ -77,5 +77,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 22000 + max_token_len_per_gpu: 22000 ulysses_sequence_parallel_size: 1 diff --git a/examples/dpo_human_in_the_loop/dpo.yaml b/examples/dpo_human_in_the_loop/dpo.yaml index bc4a015261..f13dfc539e 100644 --- a/examples/dpo_human_in_the_loop/dpo.yaml +++ b/examples/dpo_human_in_the_loop/dpo.yaml @@ -77,7 +77,7 @@ trainer: save_interval: 30 total_steps: 200 use_dynamic_bsz: false - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: actor_rollout_ref: diff --git a/examples/dpo_humanlike/dpo.yaml b/examples/dpo_humanlike/dpo.yaml index bd5bb7f118..7f07d1a226 100644 --- a/examples/dpo_humanlike/dpo.yaml +++ b/examples/dpo_humanlike/dpo.yaml @@ -42,7 +42,7 @@ trainer: save_interval: 30 total_steps: 200 use_dynamic_bsz: false - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: actor_rollout_ref: diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml index bb6733d391..77ba65d555 100644 --- a/examples/grpo_alfworld/alfworld.yaml +++ b/examples/grpo_alfworld/alfworld.yaml @@ -26,7 +26,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'alfworld_workflow' + default_workflow_type: 'alfworld_workflow' trainer_input: experience_buffer: name: alfworld_buffer @@ -54,5 +54,5 @@ trainer: save_interval: 10 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git 
a/examples/grpo_alfworld_general_multi_step/alfworld.yaml b/examples/grpo_alfworld_general_multi_step/alfworld.yaml index 806f515ba4..e36016a3b2 100644 --- a/examples/grpo_alfworld_general_multi_step/alfworld.yaml +++ b/examples/grpo_alfworld_general_multi_step/alfworld.yaml @@ -30,7 +30,7 @@ buffer: workflow_args: max_env_steps: 30 enable_progress_bar: false - default_workflow_type: 'step_wise_alfworld_workflow' + default_workflow_type: 'step_wise_alfworld_workflow' trainer_input: experience_buffer: name: alfworld_buffer @@ -61,7 +61,7 @@ trainer: save_interval: 50 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 monitor: monitor_type: 'wandb' diff --git a/examples/grpo_email_search/email_search.yaml b/examples/grpo_email_search/email_search.yaml index 710316616e..fa3b96f2a5 100644 --- a/examples/grpo_email_search/email_search.yaml +++ b/examples/grpo_email_search/email_search.yaml @@ -90,5 +90,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_gsm8k/gsm8k.yaml b/examples/grpo_gsm8k/gsm8k.yaml index 0fa3e2601e..bc3e533d5f 100644 --- a/examples/grpo_gsm8k/gsm8k.yaml +++ b/examples/grpo_gsm8k/gsm8k.yaml @@ -62,7 +62,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 # stages: # Uncomment to add a SFT warmup stage before RFT # - stage_name: sft_warmup diff --git a/examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml b/examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml index 3fccaaac28..62d380220a 100644 --- a/examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml +++ b/examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml @@ -81,5 +81,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_gsm8k_ruler/gsm8k_ruler.yaml b/examples/grpo_gsm8k_ruler/gsm8k_ruler.yaml index fa45d91913..d8b9227c32 100644 --- a/examples/grpo_gsm8k_ruler/gsm8k_ruler.yaml +++ b/examples/grpo_gsm8k_ruler/gsm8k_ruler.yaml @@ -71,5 +71,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_gsm8k_task_pipeline/gsm8k.yaml b/examples/grpo_gsm8k_task_pipeline/gsm8k.yaml index 76247d527f..fb95c0dc62 100644 --- a/examples/grpo_gsm8k_task_pipeline/gsm8k.yaml +++ b/examples/grpo_gsm8k_task_pipeline/gsm8k.yaml @@ -79,5 +79,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_gsm8k_trainable_ruler/gsm8k_ruler.yaml b/examples/grpo_gsm8k_trainable_ruler/gsm8k_ruler.yaml index 785191dcc6..fa332954cc 100644 --- a/examples/grpo_gsm8k_trainable_ruler/gsm8k_ruler.yaml +++ b/examples/grpo_gsm8k_trainable_ruler/gsm8k_ruler.yaml @@ -48,7 +48,7 @@ buffer: storage_type: queue explorer: eval_interval: 10 - runner_num: 32 + runner_per_model: 8 rollout_model: engine_num: 4 tensor_parallel_size: 1 @@ -65,5 +65,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git 
a/examples/grpo_lora_gsm8k/gsm8k.yaml b/examples/grpo_lora_gsm8k/gsm8k.yaml index bad54c22d2..e445824415 100644 --- a/examples/grpo_lora_gsm8k/gsm8k.yaml +++ b/examples/grpo_lora_gsm8k/gsm8k.yaml @@ -65,7 +65,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: actor_rollout_ref: diff --git a/examples/grpo_math/math.yaml b/examples/grpo_math/math.yaml index 39d5cafa5b..1ec35ce86c 100644 --- a/examples/grpo_math/math.yaml +++ b/examples/grpo_math/math.yaml @@ -29,8 +29,8 @@ buffer: logprobs: 0 reward_fn_args: reward_name: math_verify_reward - default_workflow_type: 'math_rm_workflow' - default_reward_fn_type: 'rm_gallery_reward' + default_workflow_type: 'math_rm_workflow' + default_reward_fn_type: 'rm_gallery_reward' trainer_input: experience_buffer: name: math_buffer @@ -54,5 +54,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_rubric_as_reward/rubric.yaml b/examples/grpo_rubric_as_reward/rubric.yaml index 4e76a9c9cd..6e66dc348f 100644 --- a/examples/grpo_rubric_as_reward/rubric.yaml +++ b/examples/grpo_rubric_as_reward/rubric.yaml @@ -31,7 +31,7 @@ buffer: rollout_args: temperature: 1.0 enable_progress_bar: false - default_workflow_type: 'rubric_judge_workflow' + default_workflow_type: 'rubric_judge_workflow' trainer_input: experience_buffer: name: experience_buffer @@ -64,5 +64,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_sciworld/sciworld.yaml b/examples/grpo_sciworld/sciworld.yaml index 8fb044781f..09bf683132 100644 --- a/examples/grpo_sciworld/sciworld.yaml +++ b/examples/grpo_sciworld/sciworld.yaml @@ -26,7 +26,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'sciworld_workflow' + default_workflow_type: 'sciworld_workflow' trainer_input: experience_buffer: name: sciworld_buffer @@ -51,5 +51,5 @@ trainer: save_interval: 10 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_toolcall/toolace.yaml b/examples/grpo_toolcall/toolace.yaml index 7d699b4bc6..05e8a7e7e4 100644 --- a/examples/grpo_toolcall/toolace.yaml +++ b/examples/grpo_toolcall/toolace.yaml @@ -25,8 +25,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - eval_tasksets: [] - default_workflow_type: 'toolcall_workflow' + default_workflow_type: 'toolcall_workflow' trainer_input: experience_buffer: name: toolace_buffer @@ -50,5 +49,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 2 diff --git a/examples/grpo_vlm/vlm.yaml b/examples/grpo_vlm/vlm.yaml index 75f1aa0b42..159f0177a7 100644 --- a/examples/grpo_vlm/vlm.yaml +++ b/examples/grpo_vlm/vlm.yaml @@ -30,8 +30,8 @@ buffer: image_key: 'images' rollout_args: temperature: 1.0 - default_workflow_type: 'simple_mm_workflow' - default_reward_fn_type: 'math_boxed_reward' + default_workflow_type: 'simple_mm_workflow' + default_reward_fn_type: 'math_boxed_reward' trainer_input: experience_buffer: name: experience_buffer @@ -53,5 +53,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: 
true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/grpo_webshop/webshop.yaml b/examples/grpo_webshop/webshop.yaml index 2183772fb9..7357002bcb 100644 --- a/examples/grpo_webshop/webshop.yaml +++ b/examples/grpo_webshop/webshop.yaml @@ -26,7 +26,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'webshop_workflow' + default_workflow_type: 'webshop_workflow' trainer_input: experience_buffer: name: webshop_buffer @@ -51,5 +51,5 @@ trainer: save_interval: 10 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/mix_chord/mix_chord.yaml b/examples/mix_chord/mix_chord.yaml index 77e54adb3a..7d62c559b1 100644 --- a/examples/mix_chord/mix_chord.yaml +++ b/examples/mix_chord/mix_chord.yaml @@ -85,7 +85,7 @@ trainer: save_interval: 50 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 25600 + max_token_len_per_gpu: 25600 ulysses_sequence_parallel_size: 2 monitor: monitor_type: wandb diff --git a/examples/mix_chord/mix_chord_toolace.yaml b/examples/mix_chord/mix_chord_toolace.yaml index fd8875266a..9380c82c36 100644 --- a/examples/mix_chord/mix_chord_toolace.yaml +++ b/examples/mix_chord/mix_chord_toolace.yaml @@ -80,7 +80,7 @@ trainer: save_interval: 50 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 25600 + max_token_len_per_gpu: 25600 ulysses_sequence_parallel_size: 2 monitor: monitor_type: wandb diff --git a/examples/mix_math/mix_math.yaml b/examples/mix_math/mix_math.yaml index ad6133f2c8..07b43a990f 100644 --- a/examples/mix_math/mix_math.yaml +++ b/examples/mix_math/mix_math.yaml @@ -84,7 +84,7 @@ trainer: save_interval: 50 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 25600 + max_token_len_per_gpu: 25600 ulysses_sequence_parallel_size: 1 monitor: monitor_type: wandb diff --git a/examples/opmd_gsm8k/opmd_gsm8k.yaml b/examples/opmd_gsm8k/opmd_gsm8k.yaml index acb65a86cd..6367e01a7c 100644 --- a/examples/opmd_gsm8k/opmd_gsm8k.yaml +++ b/examples/opmd_gsm8k/opmd_gsm8k.yaml @@ -28,7 +28,7 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'math_workflow' + default_workflow_type: 'math_workflow' trainer_input: experience_buffer: name: gsm8k_buffer @@ -51,5 +51,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/ppo_countdown/countdown.yaml b/examples/ppo_countdown/countdown.yaml index 9d792fd96c..c7bdfc9774 100644 --- a/examples/ppo_countdown/countdown.yaml +++ b/examples/ppo_countdown/countdown.yaml @@ -27,8 +27,8 @@ buffer: rollout_args: temperature: 1.0 logprobs: 0 - default_workflow_type: 'math_workflow' - default_reward_fn_type: 'countdown_reward' + default_workflow_type: 'math_workflow' + default_reward_fn_type: 'countdown_reward' trainer_input: experience_buffer: name: countdown_buffer @@ -52,7 +52,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: actor_rollout_ref: diff --git a/examples/ppo_countdown_megatron/countdown.yaml b/examples/ppo_countdown_megatron/countdown.yaml index bb47a43537..bb592d464e 100644 --- a/examples/ppo_countdown_megatron/countdown.yaml +++ b/examples/ppo_countdown_megatron/countdown.yaml @@ -26,8 +26,8 @@ 
buffer: response_key: 'answer' rollout_args: temperature: 1.0 - default_workflow_type: 'math_workflow' - default_reward_fn_type: 'countdown_reward' + default_workflow_type: 'math_workflow' + default_reward_fn_type: 'countdown_reward' trainer_input: experience_buffer: name: countdown_buffer @@ -51,7 +51,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: actor_rollout_ref: diff --git a/examples/rec_gsm8k/gsm8k.yaml b/examples/rec_gsm8k/gsm8k.yaml index 4c45270d13..3be850e401 100644 --- a/examples/rec_gsm8k/gsm8k.yaml +++ b/examples/rec_gsm8k/gsm8k.yaml @@ -54,9 +54,8 @@ buffer: storage_type: queue explorer: eval_interval: 20 - runner_num: 64 + runner_per_model: 16 rollout_model: - engine_type: vllm_async engine_num: 4 tensor_parallel_size: 1 enable_prefix_caching: false @@ -73,5 +72,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/rec_math/math.yaml b/examples/rec_math/math.yaml index 1996d9814e..226fbe7d75 100644 --- a/examples/rec_math/math.yaml +++ b/examples/rec_math/math.yaml @@ -57,7 +57,7 @@ buffer: storage_type: queue explorer: eval_interval: 500 - runner_num: 64 + runner_per_model: 16 rollout_model: engine_type: vllm_async engine_num: 4 @@ -78,5 +78,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/examples/sft_mot/sft.yaml b/examples/sft_mot/sft.yaml index d408b3da25..159a8657e1 100644 --- a/examples/sft_mot/sft.yaml +++ b/examples/sft_mot/sft.yaml @@ -32,5 +32,5 @@ trainer: save_interval: 10 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 22000 + max_token_len_per_gpu: 22000 ulysses_sequence_parallel_size: 1 diff --git a/examples/sppo_gsm8k/gsm8k.yaml b/examples/sppo_gsm8k/gsm8k.yaml index 790c2623b1..0fb4cabcba 100644 --- a/examples/sppo_gsm8k/gsm8k.yaml +++ b/examples/sppo_gsm8k/gsm8k.yaml @@ -67,5 +67,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 22000 + max_token_len_per_gpu: 22000 ulysses_sequence_parallel_size: 1 diff --git a/examples/topr_gsm8k/gsm8k.yaml b/examples/topr_gsm8k/gsm8k.yaml index 12ba9a7817..2ae39139a7 100644 --- a/examples/topr_gsm8k/gsm8k.yaml +++ b/examples/topr_gsm8k/gsm8k.yaml @@ -61,5 +61,5 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 diff --git a/tests/buffer/formatter_test.py b/tests/buffer/formatter_test.py index 1e8a682b4e..92b6616555 100644 --- a/tests/buffer/formatter_test.py +++ b/tests/buffer/formatter_test.py @@ -263,8 +263,7 @@ def test_task_formatter(self): } config = StorageConfig( is_eval=True, - default_workflow_type="math_workflow", - default_eval_workflow_type="math_boxed_workflow", + default_workflow_type="math_boxed_workflow", workflow_args={"use_base": True, "with_think": True}, ) formatter = FORMATTER.get("task")(config=config) @@ -279,7 +278,6 @@ def test_task_formatter(self): config = StorageConfig( is_eval=False, default_workflow_type="math_workflow", - default_eval_workflow_type="math_boxed_workflow", default_reward_fn_type="math_reward", workflow_args={"use_base": False, "with_think": True}, ) @@ -296,7 +294,7 @@ def 
test_task_formatter(self): config = StorageConfig( is_eval=False, - default_eval_workflow_type="math_workflow", + default_workflow_type="math_workflow", workflow_args={"use_base": True, "with_think": False}, format=FormatConfig( workflow_key="workflow", diff --git a/tests/common/config_test.py b/tests/common/config_test.py index e51832f3a6..0a6a5557b0 100644 --- a/tests/common/config_test.py +++ b/tests/common/config_test.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- """Test cases for Config modules.""" import datetime +import math import os import shutil import unittest -from tests.tools import get_template_config +from tests.tools import get_template_config, get_unittest_dataset_config from trinity.common.config import InferenceModelConfig, load_config CHECKPOINT_ROOT_DIR = os.path.join(os.path.dirname(__file__), "temp_checkpoint_dir") @@ -91,6 +92,51 @@ def test_update_config_from_ray_cluster(self): self.assertEqual(config.cluster.node_num, 2) self.assertEqual(config.cluster.gpu_per_node, 2) + def test_default_workflow(self): + config = get_template_config() + config.buffer.explorer_input.default_workflow_type = "simple_workflow" + config.buffer.explorer_input.default_eval_workflow_type = "math_boxed_workflow" + config.buffer.explorer_input.eval_tasksets.append(get_unittest_dataset_config("gsm8k")) + st = get_unittest_dataset_config("countdown") + st.default_workflow_type = None + config.buffer.explorer_input.eval_tasksets.append(st) + config.check_and_update() + self.assertEqual( + config.buffer.explorer_input.eval_tasksets[0].default_workflow_type, + "math_workflow", + ) + self.assertEqual( + config.buffer.explorer_input.eval_tasksets[1].default_workflow_type, + "math_boxed_workflow", + ) + self.assertEqual( + config.buffer.explorer_input.taskset.default_workflow_type, + "simple_workflow", + ) + + def test_max_token_len_per_gpu_set_correctly(self): + config = get_template_config() + config.model.max_model_len = 8192 + config.trainer.ulysses_sequence_parallel_size = 2 + config.trainer.max_token_len_per_gpu = None + config.check_and_update() + self.assertIsNotNone(config.trainer.trainer_config) + expected_max_token_len = math.ceil( + (2 * config.model.max_model_len) / config.trainer.ulysses_sequence_parallel_size + ) + self.assertEqual( + config.trainer.trainer_config.actor_rollout_ref.actor.ppo_max_token_len_per_gpu, + expected_max_token_len, + ) + self.assertEqual( + config.trainer.trainer_config.actor_rollout_ref.ref.log_prob_max_token_len_per_gpu, + expected_max_token_len, + ) + self.assertEqual( + config.trainer.trainer_config.critic.ppo_max_token_len_per_gpu, + expected_max_token_len, + ) + def tearDown(self): if os.path.exists(CHECKPOINT_ROOT_DIR): shutil.rmtree(CHECKPOINT_ROOT_DIR) diff --git a/tests/template/config.yaml b/tests/template/config.yaml index 74587f98dd..13b2ad081f 100644 --- a/tests/template/config.yaml +++ b/tests/template/config.yaml @@ -32,9 +32,6 @@ buffer: path: 'placeholder' split: 'train' enable_progress_bar: false - default_workflow_type: '' - default_eval_workflow_type: '' - default_reward_fn_type: '' explorer: eval_interval: 100 runner_per_model: 8 @@ -51,7 +48,7 @@ trainer: save_interval: 100 grad_clip: 1.0 use_dynamic_bsz: true - ppo_max_token_len_per_gpu: 16384 + max_token_len_per_gpu: 16384 ulysses_sequence_parallel_size: 1 trainer_config: actor_rollout_ref: @@ -64,8 +61,6 @@ trainer: lr: 1e-5 model: use_remove_padding: false - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: 
${trainer.trainer_config.critic.ppo_max_token_len_per_gpu} cliprange_value: 0.5 checkpoint: load_contents: ['model', 'optimizer', 'extra'] diff --git a/trinity/buffer/schema/formatter.py b/trinity/buffer/schema/formatter.py index e562df7d0b..4284a321ad 100644 --- a/trinity/buffer/schema/formatter.py +++ b/trinity/buffer/schema/formatter.py @@ -40,8 +40,6 @@ def __init__(self, config: StorageConfig): self.config = config self.is_eval = config.is_eval self.default_workflow_cls = WORKFLOWS.get(config.default_workflow_type) # type: ignore - if self.is_eval and config.default_eval_workflow_type: - self.default_workflow_cls = WORKFLOWS.get(config.default_eval_workflow_type) self.default_reward_fn_cls = REWARD_FUNCTIONS.get(config.default_reward_fn_type) # type: ignore self.workflow_key = config.format.workflow_key self.reward_fn_key = config.format.reward_fn_key diff --git a/trinity/buffer/storage/sql.py b/trinity/buffer/storage/sql.py index de4e34f6ff..3254cd663e 100644 --- a/trinity/buffer/storage/sql.py +++ b/trinity/buffer/storage/sql.py @@ -221,8 +221,6 @@ def __init__(self, storage_config: StorageConfig, config: BufferConfig) -> None: self.batch_size = config.batch_size self.is_eval = storage_config.is_eval self.default_workflow_cls = WORKFLOWS.get(storage_config.default_workflow_type) # type: ignore - if self.is_eval and storage_config.default_eval_workflow_type: - self.default_workflow_cls = WORKFLOWS.get(storage_config.default_eval_workflow_type) self.default_reward_fn_cls = REWARD_FUNCTIONS.get(storage_config.default_reward_fn_type) # type: ignore self.formatter = TaskFormatter(storage_config) self.offset = storage_config.index diff --git a/trinity/common/config.py b/trinity/common/config.py index fc6de7d3bd..f96077baf3 100644 --- a/trinity/common/config.py +++ b/trinity/common/config.py @@ -2,6 +2,7 @@ """Configs for RFT.""" from __future__ import annotations +import math import os from copy import deepcopy from dataclasses import dataclass, field @@ -123,9 +124,6 @@ class StorageConfig: # For continuing training index: int = 0 - # used for multi-modal data - mm_data_kwargs: dict = field(default_factory=dict) - # used for StorageType.FILE split: str = "train" subset_name: Optional[str] = None @@ -146,7 +144,6 @@ class StorageConfig: # used for rollout tasks default_workflow_type: Optional[str] = None - default_eval_workflow_type: Optional[str] = None default_reward_fn_type: Optional[str] = None rollout_args: GenerationConfig = field(default_factory=GenerationConfig) workflow_args: dict = field(default_factory=dict) @@ -390,8 +387,6 @@ class ExplorerInput: default_workflow_type: Optional[str] = None default_eval_workflow_type: Optional[str] = None default_reward_fn_type: Optional[str] = None - system_prompt: Optional[str] = None - reply_prefix: Optional[str] = None @dataclass @@ -405,10 +400,6 @@ class TrainerInput: # Some auxiliary buffers to facilitate training (e.g., data mixing) auxiliary_buffers: Dict[str, StorageConfig] = field(default_factory=dict) - # ! 
Deprecated, keep for backward compatibility, do not use it in new code - sft_warmup_dataset: Optional[StorageConfig] = None - sft_warmup_steps: Optional[int] = None - @dataclass class BufferConfig: @@ -485,7 +476,8 @@ class TrainerConfig: # trainer configs grad_clip: float = 1.0 use_dynamic_bsz: bool = True - ppo_max_token_len_per_gpu: int = 16384 + # if None, automatically set to 2 * model.max_model_len / ulysses_sequence_parallel_size + max_token_len_per_gpu: Optional[int] = None ulysses_sequence_parallel_size: int = 1 # sp size # TODO: extract more train-related params from underlying trainer engine @@ -615,14 +607,6 @@ def save(self, config_path: str) -> None: OmegaConf.save(self, f) def _check_deprecated(self) -> None: - if self.buffer.trainer_input.sft_warmup_steps is not None: - logger.warning( - "`buffer.trainer_input.sft_warmup_steps` is deprecated, SFT warmup related settings are moved to `stages`." - ) - if self.buffer.trainer_input.sft_warmup_dataset is not None: - logger.warning( - "`buffer.trainer_input.sft_warmup_dataset` is deprecated, SFT warmup related settings are moved to `stages`." - ) if self.explorer.runner_num is not None: logger.warning( "`explorer.runner_num` is deprecated, please use `explorer.runner_per_model` instead." @@ -706,17 +690,11 @@ def _check_buffer(self) -> None: # noqa: C901 experience_buffer.total_epochs = self.buffer.total_epochs experience_buffer.total_steps = self.buffer.total_steps else: - taskset.is_eval = False taskset.total_epochs = self.buffer.total_epochs taskset.total_steps = self.buffer.total_steps set_if_none(taskset, "default_workflow_type", explorer_input.default_workflow_type) - set_if_none( - taskset, "default_eval_workflow_type", explorer_input.default_eval_workflow_type - ) set_if_none(taskset, "default_reward_fn_type", explorer_input.default_reward_fn_type) - set_if_none(taskset.format, "system_prompt", explorer_input.system_prompt) - set_if_none(taskset.format, "reply_prefix", explorer_input.reply_prefix) set_if_none(taskset, "ray_namespace", self.ray_namespace) set_if_none(taskset.rollout_args, "max_tokens", self.model.max_response_tokens) @@ -729,13 +707,10 @@ def _check_buffer(self) -> None: # noqa: C901 if not dataset.name: dataset.name = f"eval_taskset_{idx}" set_if_none(dataset, "repeat_times", 1) + # eval_workflow has higher priority than workflow in eval tasksets, so we set it first + set_if_none(dataset, "default_workflow_type", explorer_input.default_eval_workflow_type) set_if_none(dataset, "default_workflow_type", explorer_input.default_workflow_type) - set_if_none( - dataset, "default_eval_workflow_type", explorer_input.default_eval_workflow_type - ) set_if_none(dataset, "default_reward_fn_type", explorer_input.default_reward_fn_type) - set_if_none(dataset.format, "system_prompt", explorer_input.system_prompt) - set_if_none(dataset.format, "reply_prefix", explorer_input.reply_prefix) set_if_none(dataset, "ray_namespace", self.ray_namespace) set_if_none(dataset.rollout_args, "max_tokens", self.model.max_response_tokens) remained_tasksets.append(dataset) @@ -1095,22 +1070,18 @@ def check_and_update(self) -> Config: # noqa: C901 ) self.trainer.trainer_config = OmegaConf.to_object(trainer_config) elif self.trainer.trainer_config_path: - logger.warning( + raise ValueError( "`trainer_config_path` is deprecated; please use `trainer_config` instead." 
) - if os.path.isfile(self.trainer.trainer_config_path): - from trinity.common.verl_config import load_config - - self.trainer.trainer_config = load_config(self.trainer.trainer_config_path) - else: - raise ValueError( - f"Invalid trainer config path: {self.trainer.trainer_config_path}" - ) else: from trinity.common.verl_config import veRLConfig logger.info("`trainer_config` is not provided, using default trainer config.") self.trainer.trainer_config = veRLConfig() + if self.trainer.max_token_len_per_gpu is None: + self.trainer.max_token_len_per_gpu = math.ceil( + 2 * self.model.max_model_len / self.trainer.ulysses_sequence_parallel_size # type: ignore [operator] + ) else: raise ValueError(f"Invalid trainer type: {self.trainer_type}") self.trainer.trainer_config.synchronize_config(self) diff --git a/trinity/common/verl_config.py b/trinity/common/verl_config.py index 7a483b8a9b..05c26bfea4 100644 --- a/trinity/common/verl_config.py +++ b/trinity/common/verl_config.py @@ -235,7 +235,7 @@ class Critic: forward_micro_batch_size_per_gpu: Optional[int] = None use_dynamic_bsz: Optional[bool] = None ppo_max_token_len_per_gpu: Optional[int] = None - forward_max_token_len_per_gpu: int = 0 + forward_max_token_len_per_gpu: Optional[int] = None ulysses_sequence_parallel_size: Optional[int] = None ppo_epochs: int = 0 shuffle: bool = False @@ -423,7 +423,7 @@ def synchronize_config(self, config: Config) -> None: # noqa: C901 self.actor_rollout_ref.actor.use_dynamic_bsz = config.trainer.use_dynamic_bsz if self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu is None: self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu = ( - config.trainer.ppo_max_token_len_per_gpu + config.trainer.max_token_len_per_gpu ) if self.actor_rollout_ref.actor.ulysses_sequence_parallel_size is None: self.actor_rollout_ref.actor.ulysses_sequence_parallel_size = ( @@ -432,14 +432,17 @@ def synchronize_config(self, config: Config) -> None: # noqa: C901 if ( self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu # type: ignore [operator] * self.actor_rollout_ref.actor.ulysses_sequence_parallel_size - < config.model.max_model_len + < config.model.max_model_len * 2 # type: ignore [operator] ): self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu = math.ceil( config.model.max_model_len # type: ignore [operator] - / self.actor_rollout_ref.actor.ulysses_sequence_parallel_size + * 2 + / self.actor_rollout_ref.actor.ulysses_sequence_parallel_size # type: ignore [operator] ) logger.warning( - f"Warning: actor.ppo_max_token_len_per_gpu is automatically set to {self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu} to match model.max_model_len ({config.model.max_model_len})" + f"actor.ppo_max_token_len_per_gpu is automatically set to {self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu} " + f"to match model.max_model_len ({config.model.max_model_len}). If you face OOM issues, " + "please set `model.max_model_len` to a smaller value." 
) # Ref Config @@ -447,12 +450,27 @@ def synchronize_config(self, config: Config) -> None: # noqa: C901 self.actor_rollout_ref.ref.log_prob_use_dynamic_bsz = config.trainer.use_dynamic_bsz if self.actor_rollout_ref.ref.log_prob_max_token_len_per_gpu is None: self.actor_rollout_ref.ref.log_prob_max_token_len_per_gpu = ( - config.trainer.ppo_max_token_len_per_gpu + self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu ) if self.actor_rollout_ref.ref.ulysses_sequence_parallel_size is None: self.actor_rollout_ref.ref.ulysses_sequence_parallel_size = ( config.trainer.ulysses_sequence_parallel_size ) + if ( + self.actor_rollout_ref.ref.log_prob_max_token_len_per_gpu # type: ignore [operator] + * self.actor_rollout_ref.ref.ulysses_sequence_parallel_size + < config.model.max_model_len * 2 # type: ignore [operator] + ): + self.actor_rollout_ref.ref.log_prob_max_token_len_per_gpu = math.ceil( + config.model.max_model_len # type: ignore [operator] + * 2 + / self.actor_rollout_ref.ref.ulysses_sequence_parallel_size # type: ignore [operator] + ) + logger.warning( + f"ref.log_prob_max_token_len_per_gpu is automatically set to {self.actor_rollout_ref.ref.log_prob_max_token_len_per_gpu} " + f"to match model.max_model_len ({config.model.max_model_len}). If you face OOM issues, " + "please set `model.max_model_len` to a smaller value." + ) # Critic config self.critic.strategy = self.actor_rollout_ref.actor.strategy @@ -466,21 +484,30 @@ def synchronize_config(self, config: Config) -> None: # noqa: C901 if self.critic.use_dynamic_bsz is None: self.critic.use_dynamic_bsz = config.trainer.use_dynamic_bsz if self.critic.ppo_max_token_len_per_gpu is None: - self.critic.ppo_max_token_len_per_gpu = config.trainer.ppo_max_token_len_per_gpu + self.critic.ppo_max_token_len_per_gpu = ( + self.actor_rollout_ref.actor.ppo_max_token_len_per_gpu + ) if self.critic.ulysses_sequence_parallel_size is None: self.critic.ulysses_sequence_parallel_size = ( config.trainer.ulysses_sequence_parallel_size ) + if ( self.critic.ppo_max_token_len_per_gpu * self.critic.ulysses_sequence_parallel_size # type: ignore [operator] - < config.model.max_model_len + < config.model.max_model_len * 2 # type: ignore [operator] ): self.critic.ppo_max_token_len_per_gpu = math.ceil( - config.model.max_model_len / self.critic.ulysses_sequence_parallel_size # type: ignore [operator] + config.model.max_model_len # type: ignore [operator] + * 2 + / self.critic.ulysses_sequence_parallel_size # type: ignore [operator] ) logger.warning( - f"Warning: critic.ppo_max_token_len_per_gpu is automatically set to {self.critic.ppo_max_token_len_per_gpu} to match model.max_model_len ({config.model.max_model_len})" + f"critic.ppo_max_token_len_per_gpu is automatically set to {self.critic.ppo_max_token_len_per_gpu} " + f"to match model.max_model_len ({config.model.max_model_len}). If you face OOM issues, " + "please set `model.max_model_len` to a smaller value." ) + if self.critic.forward_max_token_len_per_gpu is None: + self.critic.forward_max_token_len_per_gpu = self.critic.ppo_max_token_len_per_gpu # LoRA related config if config.model.lora_configs is not None: