From b2190ea024fc93acf0b2059a704cd814fe487eda Mon Sep 17 00:00:00 2001
From: kokolerk <2509260383@qq.com>
Date: Mon, 15 Dec 2025 17:53:39 +0800
Subject: [PATCH 1/5] alfworld concatenated multi-turn rft sft format

---
 examples/grpo_alfworld/README.md     | 50 ++++++++++++++++++++++++++++
 examples/grpo_alfworld/alfworld.yaml |  7 ++--
 2 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/examples/grpo_alfworld/README.md b/examples/grpo_alfworld/README.md
index 0cb710d0ba..e4c241372e 100644
--- a/examples/grpo_alfworld/README.md
+++ b/examples/grpo_alfworld/README.md
@@ -5,3 +5,53 @@ This example shows the usage of GRPO on the ALFWorld dataset.
 For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_multi_turn.md).
 
 The config file is located in [`alfworld.yaml`](alfworld.yaml).
+
+NOTE: For the Concatenated Multi-Turn RFT setup in the Qwen-2.5 series, you need to perform SFT first, then GRPO, or else the reward will be -0.1 during GRPO training due to not following the `<think></think><action></action>` format.
+
+The SFT data should be named as xxx.json, following the format: 
+
+
+
+```
+[
+    {
+         "messages": [
+            {
+                "role": "system", # fixed; keep aligned with the GRPO workflow: alfworld_workflow
+                "content": "\nYou are an agent interacting with a virtual test-based environments.\n\n## Notes:\nAt each step, you should first think then perform action to fulfill the instruction. You should ALWAYS wrap your thinking with the <think> </think> tag and wrap your action with the <action> </action> tag.\nYou should ALWAYS take one action each step. \nYou should finish the task and buy the item within 15 steps.\nDONOT try to interact with the user at anytime. Finish the task and buy the item by yourself.\n\n## Action Format:\nBelow are the available commands you can use:\n  look:                             look around your current location\n  inventory:                        check your current inventory(you can only have 1 item in your inventory)\n  go to (receptacle):               move to a receptacle\n  open (receptacle):                open a receptacle\n  close (receptacle):               close a receptacle\n  take (object) from (receptacle):  take an object from a receptacle\n  move (object) to (receptacle):  place an object in or on a receptacle\n  examine (something):              examine a receptacle or an object\n  use (object):                     use an object\n  heat (object) with (receptacle):  heat an object using a receptacle\n  clean (object) with (receptacle): clean an object using a receptacle\n  cool (object) with (receptacle):  cool an object using a receptacle\n  slice (object) with (object):     slice an object using a sharp object\n\nFor example your output should be like this:\n<think> To solve the task, I need first to ... </think><action>go to cabinet 1</action>\n"
+            },
+            {
+                "role": "user",
+                "content": "Observation: {observation by alfworld}"
+            },
+            {
+                "role": "assistant",
+                "content": "<think>think process</think><action>action</action>"
+            },
+             {
+                "role": "user",
+                "content": "Observation: {observation by alfworld}"
+            },
+             {
+                "role": "assistant",
+                "content": "<think>think process</think><action>action</action>"
+            },
+            .....
+        ],
+    },
+    {
+        "messages": [
+            {
+                ......
+            },
+        ]
+    },
+    {
+         "messages": [
+            {
+                .......
+            },
+        ]
+    },
+]
+```
\ No newline at end of file
diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml
index 32a491e1d7..dcfd21d3db 100644
--- a/examples/grpo_alfworld/alfworld.yaml
+++ b/examples/grpo_alfworld/alfworld.yaml
@@ -7,9 +7,9 @@ algorithm:
   optimizer:
     lr: 1e-6
 model:
-  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
-  max_response_tokens: 16384
-  max_model_len: 20480
+  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-3B-Instruct}
+  max_prompt_tokens: 10240 # max input tokens per turn
+  max_response_tokens: 4096 # max output tokens per turn
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -77,4 +77,5 @@ trainer:
 #           format:
 #             prompt_type: messages
 #             messages_key: 'messages'
+#             enable_concatenated_multi_turn: true # Enable concatenated multi-turn SFT data preprocessing, default is false
 #   - stage_name: rft

From e0f70da6054de8a16b777da51599c0b052770f26 Mon Sep 17 00:00:00 2001
From: kokolerk <2509260383@qq.com>
Date: Mon, 15 Dec 2025 18:26:52 +0800
Subject: [PATCH 2/5] update readme

---
 examples/grpo_alfworld/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/grpo_alfworld/README.md b/examples/grpo_alfworld/README.md
index e4c241372e..7ab715f804 100644
--- a/examples/grpo_alfworld/README.md
+++ b/examples/grpo_alfworld/README.md
@@ -8,7 +8,7 @@ The config file is located in [`alfworld.yaml`](alfworld.yaml).
 
 NOTE: For the Concatenated Multi-Turn RFT setup in the Qwen-2.5 series, you need to perform SFT first, then GRPO, or else the reward will be -0.1 during GRPO training due to not following the `<think></think><action></action>` format.
 
-The SFT data should be named as xxx.json, following the format: 
+The SFT data should be named as `<TRINITY_SFT_DATASET_PATH>/data.json`, following the format: 
 
 
 
@@ -28,11 +28,11 @@ The SFT data should be named as xxx.json, following the format:
                 "role": "assistant",
                 "content": "<think>think process</think><action>action</action>"
             },
-             {
+            {
                 "role": "user",
                 "content": "Observation: {observation by alfworld}"
             },
-             {
+            {
                 "role": "assistant",
                 "content": "<think>think process</think><action>action</action>"
             },

From 0485aef5a6472a4dcaa3e24eb268933b1db9a6d3 Mon Sep 17 00:00:00 2001
From: Yuchang Sun <52027540+hiyuchang@users.noreply.github.com>
Date: Mon, 15 Dec 2025 18:37:01 +0800
Subject: [PATCH 3/5] Apply suggestion from @hiyuchang

---
 examples/grpo_alfworld/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/grpo_alfworld/README.md b/examples/grpo_alfworld/README.md
index 7ab715f804..19280a9c6b 100644
--- a/examples/grpo_alfworld/README.md
+++ b/examples/grpo_alfworld/README.md
@@ -6,7 +6,7 @@ For more detailed information, please refer to the [documentation](../../docs/sp
 
 The config file is located in [`alfworld.yaml`](alfworld.yaml).
 
-NOTE: For the Concatenated Multi-Turn RFT setup in the Qwen-2.5 series, you need to perform SFT first, then GRPO, or else the reward will be -0.1 during GRPO training due to not following the `<think></think><action></action>` format.
+NOTE: For the Concatenated Multi-Turn RFT setup in the Qwen-2.5 series, the model may not follow the `<think></think><action></action>` format. You may need to perform SFT first, then GRPO.
 
 The SFT data should be named as `<TRINITY_SFT_DATASET_PATH>/data.json`, following the format: 
 

From 13cb5ceb2b94fcb41667f6344d0a251f67c6db61 Mon Sep 17 00:00:00 2001
From: Jiaqi Wang <68948604+kokolerk@users.noreply.github.com>
Date: Mon, 15 Dec 2025 18:42:30 +0800
Subject: [PATCH 4/5] update readme

---
 examples/grpo_alfworld/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/grpo_alfworld/README.md b/examples/grpo_alfworld/README.md
index 19280a9c6b..ae116f9fd5 100644
--- a/examples/grpo_alfworld/README.md
+++ b/examples/grpo_alfworld/README.md
@@ -54,4 +54,4 @@ The SFT data should be named as `<TRINITY_SFT_DATASET_PATH>/data.json`, followin
         ]
     },
 ]
-```
\ No newline at end of file
+```

From 4d726d1ab45d1a8f5cb6d6a1ea9d69bb2d308e94 Mon Sep 17 00:00:00 2001
From: Yuchang Sun <52027540+hiyuchang@users.noreply.github.com>
Date: Mon, 15 Dec 2025 19:24:14 +0800
Subject: [PATCH 5/5] Apply suggestion from @hiyuchang

---
 examples/grpo_alfworld/README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/grpo_alfworld/README.md b/examples/grpo_alfworld/README.md
index ae116f9fd5..16c37f7fe5 100644
--- a/examples/grpo_alfworld/README.md
+++ b/examples/grpo_alfworld/README.md
@@ -8,9 +8,7 @@ The config file is located in [`alfworld.yaml`](alfworld.yaml).
 
 NOTE: For the Concatenated Multi-Turn RFT setup in the Qwen-2.5 series, the model may not follow the `<think></think><action></action>` format. You may need to perform SFT first, then GRPO.
 
-The SFT data should be named as `<TRINITY_SFT_DATASET_PATH>/data.json`, following the format: 
-
-
+The SFT data should be saved as `<TRINITY_SFT_DATASET_PATH>/data.json`, in the following format:
 
 ```
 [