Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions examples/bots/bots.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# BOTS difficulty-based task-selection experiment: GRPO training of
# Qwen2.5-1.5B-Instruct on math tasks with boxed answers.
project: "BOTS-Selector"
name: "qwen2.5-1.5B-instruct-bots"
# Overridable via the TRINITY_CHECKPOINT_ROOT_DIR env var (OmegaConf resolver).
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
data_processor:
  experience_pipeline:
    operators:
      # Computes per-task pass rates; presumably feeds the difficulty_based
      # task selector below — TODO confirm against the operator implementation.
      - name: pass_rate_calculator
algorithm:
  algorithm_type: grpo
  # 16 rollouts per task (GRPO group size).
  repeat_times: 16
  optimizer:
    lr: 1e-6
model:
  # Overridable via the TRINITY_MODEL_PATH env var.
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-1.5B-Instruct}
  max_prompt_tokens: 4096
  max_response_tokens: 8192
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 1
  batch_size: 32
  explorer_input:
    taskset:
      name: math-train
      storage_type: file
      # Replace <DATA_ROOT> with the local dataset root before running.
      path: '<DATA_ROOT>/LLM360/guru-RL-92k/train/math__combined_54.4k.parquet'
      split: 'train'
      format:
        prompt_key: 'prompt'
        response_key: 'reward_model.ground_truth'
      rollout_args:
        temperature: 1.0
      task_selector:
        selector_type: difficulty_based
        # Pre-computed pass-rate columns used as difficulty features.
        feature_keys: [ "qwen2.5_7b_pass_rate", "qwen3_30b_pass_rate" ]
        # Selector hyper-parameters; semantics are defined by the
        # difficulty_based selector implementation — see its docs.
        kwargs:
          m: 16
          lamb: 0.1
          rho: 0.1
          target_reward: 0.5
          tau: 0
          do_sample: true
    eval_tasksets:
      - name: math-eval
        storage_type: file
        path: '<DATA_ROOT>/LLM360/guru-RL-92k/online_eval/math__math_500.parquet'
        format:
          prompt_key: 'prompt'
          response_key: 'reward_model.ground_truth'
        rollout_args:
          temperature: 1.0
    # Registered in examples/bots/plugins/bots_math_boxed_workflow.py.
    default_workflow_type: 'bots_math_boxed_workflow'
  trainer_input:
    experience_buffer:
      name: exp_buffer
      storage_type: queue
      path: 'sqlite:///bots_trainer_buffer.db'
explorer:
  # Run evaluation every 40 exploration steps.
  eval_interval: 40
  runner_per_model: 8
  rollout_model:
    engine_num: 4
    tensor_parallel_size: 1
    enable_prefix_caching: false
    enforce_eager: true
    dtype: bfloat16
    seed: 42
synchronizer:
  # NCCL weight sync between trainer and rollout engines every 8 steps.
  sync_method: 'nccl'
  sync_interval: 8
  sync_timeout: 1200
trainer:
  trainer_type: 'verl'
  save_interval: 800
  grad_clip: 1.0
  use_dynamic_bsz: true
  max_token_len_per_gpu: 24576
  ulysses_sequence_parallel_size: 1
32 changes: 32 additions & 0 deletions examples/bots/plugins/bots_math_boxed_reward.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import Optional

from trinity.common.rewards.reward_fn import REWARD_FUNCTIONS, RewardFn
from trinity.utils.eval_utils import validate_think_pattern

from .bots_reward import compute_score

@REWARD_FUNCTIONS.register_module("bots_math_boxed_reward")
class BOTSMathBoxedRewardFn(RewardFn):
    """Reward function for BOTS math tasks with boxed answers.

    Produces an accuracy score (via ``compute_score``) plus an optional
    penalty when a required ``<think>`` pattern is missing from the response.
    """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        # Stateless; kwargs accepted only for interface compatibility.
        pass

    def __call__(  # type: ignore
        self,
        response: str,
        truth: Optional[str] = None,
        with_think: Optional[bool] = False,
        format_score_coef: Optional[float] = 0.1,
        **kwargs,
    ) -> dict[str, float]:
        accuracy = compute_score(response, truth)

        # Penalize only when thinking is required but the pattern is absent;
        # a falsy coefficient falls back to the default 0.1.
        think_ok = (not with_think) or validate_think_pattern(response)
        penalty = 0.0 if think_ok else -(format_score_coef or 0.1)

        return {"accuracy": accuracy, "format_score": penalty}
16 changes: 16 additions & 0 deletions examples/bots/plugins/bots_math_boxed_workflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from trinity.common.workflows.customized_math_workflows import MathBoxedWorkflow, Task
from trinity.common.workflows.workflow import WORKFLOWS

from .bots_math_boxed_reward import BOTSMathBoxedRewardFn

@WORKFLOWS.register_module("bots_math_boxed_workflow")
class BOTSMathBoxedWorkflow(MathBoxedWorkflow):
    """Boxed-answer math workflow variant that plugs in the BOTS reward."""

    def reset(self, task: Task):
        """Re-initialize for *task*, then swap in the BOTS reward function."""
        super().reset(task)
        # Override whatever reward fn the base reset installed.
        self.reward_fn = BOTSMathBoxedRewardFn(**self.reward_fn_args)

    def format_messages(self):
        """Pass the task description through unchanged.

        The prompts are already in message format, so no conversion is needed.
        """
        return self.task_desc
Loading