320 changes: 108 additions & 212 deletions docs/sphinx_doc/source/tutorial/example_data_functionalities.md

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions docs/sphinx_doc/source/tutorial/example_mix_algo.md
@@ -60,9 +60,8 @@ class MIXAlgorithm(AlgorithmType):
def default_config(cls) -> Dict:
return {
"repeat_times": 8,
"add_strategy": "grpo",
"policy_loss_fn": "mix",
"advantage_fn": "grpo",
"policy_loss_fn": "mix",
"sample_strategy": "mix",
}
```
4 changes: 2 additions & 2 deletions docs/sphinx_doc/source/tutorial/example_react.md
@@ -19,7 +19,7 @@ Trinity-RFT is designed to be highly modular. You can easily embed complex, pre-
Modern agentic tasks often involve multiple steps of reasoning, tool use, and observation. Trinity-RFT natively supports training across these Multi-Step interactions.

- **Step-Wise Experience Generation**: Instead of only learning from the final answer, Trinity can treat each step within an agent's reasoning trajectory as a distinct learning opportunity.
- **Credit Assignment**: The reward for solving a task is propagated back to all experiences within the successful trajectory, enabling the model to learn the entire reasoning chain, not just the final response. This is controlled by the `add_strategy` in the config.
- **Credit Assignment**: The reward for solving a task is propagated back to all experiences within the successful trajectory, enabling the model to learn the entire reasoning chain, not just the final response. This is controlled by the `advantage_fn` in the config.

### Native Tool Calling Support
Trinity-RFT's inference engine and training pipeline are built to support the native OpenAI `tool_calls` format.
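
For reference, below is a minimal sketch of what an OpenAI-style `tool_calls` exchange looks like; the tool name and arguments are invented for illustration and are not part of Trinity-RFT — only the message structure matters here.

```python
# A minimal sketch of the OpenAI-style `tool_calls` message format referenced above.
# The tool name ("get_weather") and its arguments are hypothetical, not from Trinity-RFT.
assistant_message = {
    "role": "assistant",
    "content": None,
    "tool_calls": [
        {
            "id": "call_0",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"city": "Beijing"}',  # JSON-encoded string, per the OpenAI spec
            },
        }
    ],
}

# The tool's observation is fed back as a "tool" message keyed by the matching tool_call_id.
tool_message = {
    "role": "tool",
    "tool_call_id": "call_0",
    "content": '{"temperature_c": 25}',
}
```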
Expand Down Expand Up @@ -83,7 +83,7 @@ This setting in the `algorithm` section defines how experiences from a Multi-Ste
```yaml
algorithm:
algorithm_type: grpo
add_strategy: step_wise_grpo # Key for Multi-Step training
advantage_fn: step_wise_grpo # Key for Multi-Step training
```
- `step_wise_grpo`: This strategy tells Trinity to create a distinct training sample for each step in the agent's execution path. The `grpo` algorithm then uses these samples to update the model.

4 changes: 2 additions & 2 deletions docs/sphinx_doc/source/tutorial/example_step_wise.md
@@ -77,7 +77,7 @@ and include it in the init file `trinity/common/workflows/__init__.py`

In general multi-step scenarios, each run may generate a varying number of experiences. To accommodate this case, we provide several flexible designs.

- `algorithm.add_strategy = step_wise_grpo`: This function allows you compute the advantages for the collected experience before adding to the buffer. For this example, we use `step_wise_grpo` which broadcasts advantages from the last step to previous steps.
- `algorithm.advantage_fn = step_wise_grpo`: This function lets you compute advantages for the collected experiences before they are added to the buffer. For this example, we use `step_wise_grpo`, which broadcasts advantages from the last step to previous steps (see the sketch after this list).

- `buffer.train_batch_size`: The number of experiences to be sampled from the buffer for training, which can be different from the number of generated experiences in each explore step.
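
To make the broadcasting behavior concrete, here is a small self-contained sketch of the idea behind `step_wise_grpo`: a GRPO-style group advantage is computed from the final-step rewards and then propagated to every step of each run. The runs and rewards below are invented, and this is not the actual Trinity-RFT implementation, which operates on its own experience objects.

```python
# Sketch of the "broadcast from the last step" idea behind step_wise_grpo.
# The runs below are invented; the real logic lives in Trinity-RFT's advantage functions.
from statistics import mean, pstdev
from typing import Dict, List

# Three runs of the same task (repeat_times = 3); each run has a different
# number of steps and only the final step carries the task reward.
runs: List[List[Dict[str, float]]] = [
    [{"reward": 0.0}, {"reward": 1.0}],                    # 2 steps, solved
    [{"reward": 0.0}, {"reward": 0.0}, {"reward": 0.0}],   # 3 steps, failed
    [{"reward": 1.0}],                                     # 1 step, solved
]

# GRPO-style group normalization over the final-step rewards of the group.
final_rewards = [run[-1]["reward"] for run in runs]
mu = mean(final_rewards)
sigma = pstdev(final_rewards) or 1.0  # guard against a zero-variance group

for run, reward in zip(runs, final_rewards):
    advantage = (reward - mu) / sigma
    for step in run:  # broadcast the run-level advantage to every step
        step["advantage"] = advantage

for i, run in enumerate(runs):
    print(f"run {i}: {[round(s['advantage'], 3) for s in run]}")
```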

Expand All @@ -95,7 +95,7 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/ALFWORLD_RFT/
algorithm:
algorithm_type: grpo
repeat_times: 16
add_strategy: step_wise_grpo
advantage_fn: step_wise_grpo
model:
model_path: /PATH/TO/MODEL/
max_response_tokens: 16384
2 changes: 0 additions & 2 deletions docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -90,7 +90,6 @@ algorithm:
kl_penalty_fn: "none"
kl_loss_fn: "k2"
entropy_loss_fn: "default"
add_strategy: null
```

- `algorithm_type`: Type of reinforcement learning algorithm. Supported types: `ppo`, `grpo`, `opmd`, `dpo`, `sft`, `mix`.
@@ -100,7 +99,6 @@ algorithm:
- `kl_penalty_fn`: The KL penalty function used to compute the KL penalty applied to the reward.
- `kl_loss_fn`: The function used to compute the KL loss.
- `entropy_loss_fn`: The function used to compute the entropy loss.
- `add_strategy`: Strategy for adding new experiences to the experience buffer. If set, explorer will collect experiences from workflow runners and pre-process them before adding to the buffer.

---

212 changes: 112 additions & 100 deletions docs/sphinx_doc/source/tutorial/trinity_programming_guide.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion environments/env_mapping.json
@@ -2,7 +2,7 @@
"trinity.data": {
"env_name": "trinity_data",
"env_yaml": "environments/data.yaml",
"env_entry": "trinity/data/server.py"
"env_entry": "trinity/service/data_juicer/server/server.py"
},
"trinity.training": {
"env_name": "trinity",
@@ -4,7 +4,7 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
algorithm_type: grpo
repeat_times: 8
add_strategy: step_wise_grpo
advantage_fn: step_wise_grpo
model:
model_path: /PATH/TO/MODEL/Qwen3-8B
max_response_tokens: 16384
@@ -4,7 +4,7 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
algorithm_type: grpo
repeat_times: 8
add_strategy: step_wise_grpo
advantage_fn: step_wise_grpo
model:
model_path: /PATH/TO/MODEL/Qwen3-4B
max_response_tokens: 16384
2 changes: 1 addition & 1 deletion examples/grpo_alfworld_general_multi_step/alfworld.yaml
@@ -4,7 +4,7 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/ALFWORLD_RFT/
algorithm:
algorithm_type: grpo
repeat_times: 16
add_strategy: step_wise_grpo
advantage_fn: step_wise_grpo
model:
model_path: /PATH/TO/MODEL/
max_response_tokens: 16384
@@ -7,5 +7,5 @@ process:
- llm_quality_score_filter:
api_or_hf_model: "qwen2.5-32b-instruct" # use "qwen2.5-32b-instruct" to calculate the quality scores.
min_score: 0.0
input_keys: ["prompt_text", "prompt_text"] # set input_keys and field_names to the existing key names in gsm-8k. Here calculating the difficulty scores according to both questions and answers.
input_keys: ["prompt_text", "response_text"] # set input_keys and field_names to the existing key names in gsm-8k. Here calculating the difficulty scores according to both questions and answers.
field_names: ["prompt", "response"]
55 changes: 26 additions & 29 deletions examples/grpo_gsm8k_experience_pipeline/gsm8k.yaml
@@ -1,29 +1,30 @@
project: "Trinity-RFT-gsm8k-experience-pipeline"
name: "qwen2.5-1.5B-gsm8k-experience-pipeline"
project: "Trinity-RFT-gsm8k-exp-pipe"
name: "qwen2.5-1.5B-gsm8k-exp-pipe"
checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
algorithm_type: grpo
repeat_times: 8

service:
data_juicer:
server_url: 'http://127.0.0.1:5005'
port: 5005
data_processor:
data_processor_url: 'http://127.0.0.1:5005/data_processor'
# experience pipeline related
experience_pipeline:
# I/O buffers
input_buffers:
- name: gsm8k_exp_output
output_buffer:
name: reshaped_gsm8k_exp_input
# format mapping
format:
reward_key: 'reward' # the key name of the reward in the experience
# data active iterator related
dj_config_path: 'examples/grpo_gsm8k_experience_pipeline/dj_scoring_exp.yaml'
clean_strategy: 'iterative'
# reward shaping
reward_shaping:
- stats_key: 'llm_quality_score'
op_type: ADD
weight: 1.0
operators:
- name: data_juicer
args:
config_path: 'examples/grpo_gsm8k_experience_pipeline/dj_scoring_exp.yaml'
- name: reward_shaping_mapper
args:
reward_shaping_configs:
- stats_key: 'llm_quality_score'
op_type: ADD
weight: 1.0
save_input: false


algorithm:
algorithm_type: grpo
repeat_times: 8
model:
model_path: /PATH/TO/MODEL/
max_response_tokens: 1024
@@ -58,17 +59,13 @@ buffer:
prompt_key: 'question'
response_key: 'answer'
default_workflow_type: 'math_workflow'
explorer_output:
name: gsm8k_exp_output
storage_type: queue
path: 'sqlite:///gsm8k_exp_output.db'
trainer_input:
experience_buffer:
name: reshaped_gsm8k_exp_input
name: reshaped_gsm8k_buffer
storage_type: queue
path: 'sqlite:///reshaped_gsm8k_exp_input.db'
path: 'sqlite:///reshaped_gsm8k.db'
explorer:
eval_interval: 50
eval_interval: 6
runner_num: 32
rollout_model:
engine_type: vllm_async
40 changes: 16 additions & 24 deletions examples/grpo_gsm8k_task_pipeline/gsm8k.yaml
@@ -4,28 +4,25 @@ checkpoint_root_dir: /PATH/TO/CHECKPOINT/
algorithm:
algorithm_type: grpo
repeat_times: 8

service:
data_juicer:
auto_start: true
data_processor:
data_processor_url: 'http://127.0.0.1:5005/data_processor'
# task pipeline related
task_pipeline:
# I/O buffers
input_buffers:
- name: 'raw_input'
path: 'openai/gsm8k'
storage_type: 'file'
raw: true
output_buffer:
name: 'raw_output'
path: './outputs/task_pipeline_output/prioritized_gsm8k.jsonl'
storage_type: 'file'
# format mapping
format:
prompt_key: 'question'
response_key: 'answer'
# data active iterator related
dj_process_desc: 'Please compute difficulty scores for these math questions.'
agent_model_name: 'qwen-max'
clean_strategy: 'iterative'
num_process: 32
operators:
- name: "llm_difficulty_score_filter"
args:
api_or_hf_model: "qwen2.5-7b-instruct"
min_score: 0.0
input_keys: ["question", "answer"]
field_names: ["Question", "Answer"]
inputs: # the output will be set to the explorer input automatically
- /PATH/TO/GSM8K/DATA/FILE
target_fields: ["question", "answer"]

model:
model_path: /PATH/TO/MODEL/
max_response_tokens: 1024
@@ -64,11 +61,6 @@ buffer:
name: gsm8k_buffer
storage_type: queue
path: 'sqlite:///gsm8k.db'
# sft_warmup_steps: 0
# sft_warmup_dataset: # Uncomment these to enable sft warmup
# name: warmup_data
# storage_type: file
# path: '/PATH/TO/WARMUP_DATA/'
explorer:
eval_interval: 50
runner_num: 32
Empty file added tests/algorithm/__init__.py
Empty file.