diff --git a/agential/cog/agent/react.py b/agential/cog/agent/react.py index ace85ee45..6eafacffe 100644 --- a/agential/cog/agent/react.py +++ b/agential/cog/agent/react.py @@ -96,13 +96,17 @@ def generate( ) # Observe. - obs = self.strategy.generate_observation( + obs, external_tool_info = self.strategy.generate_observation( idx=idx, action_type=action_type, query=query ) out.append( self.strategy.create_output_dict( - thought=thought, action_type=action_type, query=query, obs=obs + thought=thought, + action_type=action_type, + query=query, + obs=obs, + external_tool_info=external_tool_info, ) ) diff --git a/agential/cog/agent/reflexion.py b/agential/cog/agent/reflexion.py index eccb62826..82fcabda2 100644 --- a/agential/cog/agent/reflexion.py +++ b/agential/cog/agent/reflexion.py @@ -265,7 +265,7 @@ def _generate_react( ) # Observe. - is_correct, obs = self.strategy.generate_observation( + is_correct, obs, external_tool_info = self.strategy.generate_observation( step_idx=step_idx, action_type=action_type, query=query, @@ -278,6 +278,7 @@ def _generate_react( action_type=action_type, query=query, obs=obs, + external_tool_info=external_tool_info, is_correct=is_correct, ) ) diff --git a/agential/cog/eval/reflexion.py b/agential/cog/eval/reflexion.py index 3612e3bdc..437b25e7c 100644 --- a/agential/cog/eval/reflexion.py +++ b/agential/cog/eval/reflexion.py @@ -3,7 +3,7 @@ from agential.utils.parse import normalize_answer -def EM(answer: str, key: str) -> bool: +def EM(answer: str, key: str, normalize: bool = True) -> bool: """Compares two strings, `answer` and `key`, after normalizing them. The Exact Match grading 'metric' compares for an exact match between 2 strings @@ -12,8 +12,14 @@ def EM(answer: str, key: str) -> bool: Args: answer (str): A string to be compared with `key`. key (str): A string to be compared with `answer`. + normalize (bool): If True, then normalize answer and key. Defaults to True. 
Returns: bool: True if the normalized `answer` and `key` match, else False. """ + if answer is None: + return False + + if not normalize: + return answer == key return normalize_answer(answer) == normalize_answer(key) diff --git a/agential/cog/prompts/agent/reflexion.py b/agential/cog/prompts/agent/reflexion.py index 9d15b7ece..488ebf8db 100644 --- a/agential/cog/prompts/agent/reflexion.py +++ b/agential/cog/prompts/agent/reflexion.py @@ -602,3 +602,1465 @@ Action 2: Finish[Lindsey Vonn] Reflection: My reasoning failed because I doubted the clear evidence provided by the source and made an incorrect assumption based on the fame of another skier. In the future, I should rely on the provided evidence rather than making unsupported assumptions.""" + + +# ======================================================================== GSM8K ======================================================================== # + + +GSM8K_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT = """Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny? +Thought: Let's think step by step. Jason had 20 lollipops initially and now he has 12 lollipops. So, he must have given 20 + 12 = 32 lollipops to Denny. +Action: Finish[ +```python +jason_lollipops_initial = 20 +jason_lollipops_after = 12 +denny_lollipops = jason_lollipops_initial + jason_lollipops_after +answer = denny_lollipops +``` +] + +Reflection: My reasoning failed because I incorrectly added the initial and remaining lollipops instead of subtracting the remaining from the initial to find out how many were given away. In the future, I should ensure to carefully review the mathematical operations required to solve the problem accurately. + +--- + +Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today? +Thought: Let's think step by step. 
There were initially 15 trees and after planting, there are 21 trees. So, the number of trees planted is 15 - 21 = -6. +Action: Finish[ +```python +trees_initial = 15 +trees_after = 21 +trees_added = trees_initial - trees_after +answer = trees_added +``` +] + +Reflection: My reasoning failed because I subtracted the final number of trees from the initial number of trees instead of subtracting the initial number from the final number. In the future, I should ensure to correctly apply the operations needed to find the difference. + +--- + +Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now? +Thought: Let's think step by step. Shawn had 5 toys initially, and he got 2 toys from his mom and 2 toys from his dad. So, the total number of toys he has now is 5 + 2 = 7. +Action: Finish[ +```python +toys_initial = 5 +mom_toys = 2 +dad_toys = 2 +total_toys = toys_initial + mom_toys +answer = total_toys +``` +] + +Reflection: My reasoning failed because I only added the toys from his mom and forgot to include the toys from his dad. In the future, I should ensure to account for all given quantities in the problem to arrive at the correct solution. + +--- + +Question: There were nine computers in the server room. Five more computers were installed each day, from Monday to Thursday. How many computers are now in the server room? +Thought: Let's think step by step. Initially, there were 9 computers. Five more computers were added each day from Monday to Thursday, which is 4 days. So, the total number of computers added is 5 + 4 = 9. Therefore, the total number of computers now is 9 + 9 = 18. 
+Action: Finish[ +```python +computers_initial = 9 +computers_per_day = 5 +num_days = 4 # 4 days between Monday and Thursday +computers_added = computers_per_day + num_days +computers_total = computers_initial + computers_added +answer = computers_total +``` +] + +Reflection: My reasoning failed because I incorrectly added the number of computers per day to the number of days instead of multiplying them. In the future, I should ensure to use the correct operations to calculate the total number of items added. + +--- + +Question: Michael had 58 golf balls. On Tuesday, he lost 23 golf balls. On Wednesday, he lost 2 more. How many golf balls did he have at the end of Wednesday? +Thought: Let's think step by step. Michael initially had 58 golf balls. He lost 23 golf balls on Tuesday and 2 more on Wednesday. So, the total number of golf balls lost is 23 + 2 = 25. Therefore, the number of golf balls he has left is 58 + 25 = 83. +Action: Finish[ +```python +golf_balls_initial = 58 +golf_balls_lost_tuesday = 23 +golf_balls_lost_wednesday = 2 +golf_balls_lost_total = golf_balls_lost_tuesday + golf_balls_lost_wednesday +golf_balls_left = golf_balls_initial + golf_balls_lost_total +answer = golf_balls_left +``` +] + +Reflection: My reasoning failed because I incorrectly added the total number of golf balls lost to the initial number of golf balls instead of subtracting it. In the future, I should ensure to use the correct operation (subtraction) to find the remaining number of items. + +--- + +Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot? +Thought: Let's think step by step. Initially, there are 3 cars in the parking lot. When 2 more cars arrive, the total number of cars is 3 - 2 = 1. 
+Action: Finish[ +```python +cars_initial = 3 +cars_arrived = 2 +total_cars = cars_initial - cars_arrived +answer = total_cars +``` +] + +Reflection: My reasoning failed because I incorrectly subtracted the number of cars that arrived from the initial number of cars instead of adding them. In the future, I should ensure to use the correct operation (addition) when calculating the total number of items after more items arrive. + +--- + +Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left? +Thought: Let's think step by step. Olivia had $23 initially. She bought 5 bagels for $3 each. The total cost of the bagels is 5 * 3 = $15. So, the money she has left is 23 + 15 = $38. +Action: Finish[ +```python +money_initial = 23 +bagels = 5 +bagel_cost = 3 +money_spent = bagels * bagel_cost +money_left = money_initial + money_spent +answer = money_left +``` +] + +Reflection: My reasoning failed because I incorrectly added the total cost of the bagels to the initial amount of money instead of subtracting it. In the future, I should ensure to use the correct operation (subtraction) when calculating the remaining amount of money after a purchase. + +--- + +Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total? +Thought: Let's think step by step. Leah had 32 chocolates and her sister had 42 chocolates. Together they had 32 + 42 = 74 chocolates. They ate 35 chocolates, so the number of chocolates left is 74 + 35 = 109. +Action: Finish[ +```python +leah_chocolates = 32 +sister_chocolates = 42 +total_chocolates = leah_chocolates + sister_chocolates +chocolates_eaten = 35 +chocolates_left = total_chocolates + chocolates_eaten +answer = chocolates_left +``` +] + +Reflection: My reasoning failed because I incorrectly added the number of chocolates eaten to the total number of chocolates instead of subtracting it. 
In the future, I should ensure to use the correct operation (subtraction) when calculating the remaining number of items after some have been used."""
+
+
+REFLEXION_COT_REFLECT_INSTRUCTION_GSM8K = """You are an advanced reasoning agent that can improve based on self-reflection. You will be given a previous reasoning trial in which you were given a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+Previous trial:
+Question: {question}{scratchpad}
+
+Reflection:"""
+
+
+REFLEXION_COT_INSTRUCTION_GSM8K = """Solve a math question task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+{reflections}
+
+Question: {question}{scratchpad}"""
+
+
+REFLEXION_REACT_INSTRUCTION_GSM8K = """Answer a math question with interleaving Thought, Action, Observation steps. Thought can reason about the current question and plan the retrieval steps, and Action can be two types:
+(1) Calculate[code], which implements code to answer the math question, saving the answer as the `answer` variable.
+(2) Finish[code], which returns the code to answer the math question and finishes the task, saving the answer as the `answer` variable.
+You have a maximum of {max_steps} steps.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+{reflections}
+
+Question: {question}
+{scratchpad}"""
+
+
+REFLEXION_REACT_REFLECT_INSTRUCTION_GSM8K = """You are an advanced reasoning agent that can improve based on self-reflection.
You will be given a previous reasoning trial in which you were given access to an Docstore API environment and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[], or you used up your set number of reasoning steps. In a few sentences, Diagnose a possible reason for failure and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences. +Here are some examples: +{examples} +(END OF EXAMPLES) + +Previous trial: +Question: {question}{scratchpad} + +Reflection:""" + + +GSM8K_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT = """Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny? +Thought 1: First, I need to find out how many lollipops Jason gave to Denny. +Action 1: Calculate[ +```python +jason_lollipops_initial = 20 +jason_lollipops_after = 12 +answer = jason_lollipops_after + jason_lollipops_initial +``` +] +Observation 1: +```python +jason_lollipops_initial = 20 +jason_lollipops_after = 12 +answer = jason_lollipops_after + jason_lollipops_initial +``` +Execution Status: Done +Output: answer = 32 +Thought 2: The answer seems too high, but I will proceed with this calculation. +Action 2: Finish[ +```python +answer = 32 +``` +] +Observation 2: +```python +answer = 32 +``` + +Reflection: My reasoning failed because I added the remaining lollipops to the initial count instead of subtracting the remaining from the initial to find out how many were given away. In the future, I should ensure to carefully review the mathematical operations required to solve the problem accurately. The correct operation should have been subtraction, not addition. +``` + +--- + +Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today? 
+Thought 1: I need to determine how many trees were planted. +Action 1: Calculate[ +```python +trees_initial = 15 +trees_after = 21 +answer = trees_initial - trees_after +``` +] +Observation 1: +```python +trees_initial = 15 +trees_after = 21 +answer = trees_initial - trees_after +``` +Execution Status: Done +Output: answer = -6 +Thought 2: The answer should be positive because trees are being added, not removed. However, I will proceed with the subtraction result. +Action 2: Finish[ +```python +answer = -6 +``` +] +Observation 2: +```python +answer = -6 +``` + +Reflection: My reasoning failed because I subtracted the final number of trees from the initial number instead of the correct operation, which is subtracting the initial number of trees from the final number. In the future, I should ensure to carefully review the mathematical operations required to solve the problem accurately. The correct answer should be obtained by reversing the subtraction to reflect the addition of trees. +``` + +--- + +Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now? +Thought 1: I need to calculate the total number of toys Shawn has after receiving gifts from his parents. +Action 1: Calculate[ +```python +toys_initial = 5 +toys_received_mom = 2 +toys_received_dad = 2 +answer = toys_initial + toys_received_mom + toys_received_dad +``` +] +Observation 1: +```python +toys_initial = 5 +toys_received_mom = 2 +toys_received_dad = 2 +answer = toys_initial + toys_received_mom + toys_received_dad +``` +Execution Status: Done +Output: answer = 9 +Thought 2: I realize that I calculated the total toys received correctly. However, I will double-check by using another method. 
+Action 2: Calculate[ +```python +toys_initial = 5 +toys_received = 2 * 3 +answer = toys_initial + toys_received +``` +] +Observation 2: +```python +toys_initial = 5 +toys_received = 2 * 3 +answer = toys_initial + toys_received +``` +Execution Status: Done +Output: answer = 11 +Thought 3: According to my second calculation, Shawn has 11 toys now. I need to confirm the total number of toys. +Action 3: Finish[ +```python +answer = 11 +``` +] +Observation 3: +```python +answer = 11 +``` + +Reflection: My reasoning failed because I mistakenly multiplied the number of toys received instead of adding the correct amounts. Initially, I calculated correctly by adding the 2 toys received from mom and 2 toys received from dad to the initial 5 toys. However, my second method was incorrect as I wrongly multiplied the toys received. In the future, I should ensure to stick with the correct arithmetic operations and verify my calculations accurately. The correct answer should be 9 toys. +``` + +--- + +Question: There were nine computers in the server room. Five more computers were installed each day, from Monday to Thursday. How many computers are now in the server room? +Thought 1: I need to calculate the total number of computers installed from Monday to Thursday. +Action 1: Calculate[ +```python +computers_initial = 9 +computers_per_day = 5 +num_days = 4 # 4 days between Monday and Thursday +computers_added = computers_per_day * num_days +answer = computers_initial + computers_added +``` +] +Observation 1: +```python +computers_initial = 9 +computers_per_day = 5 +num_days = 4 # 4 days between Monday and Thursday +computers_added = computers_per_day * num_days +answer = computers_initial + computers_added +``` +Execution Status: Done +Output: answer = 29 +Thought 2: The total number of computers seems high. Let me verify by recalculating step-by-step. 
+Action 2: Calculate[ +```python +computers_per_day = 5 +num_days = 4 +computers_added = computers_per_day + num_days +answer = computers_initial + computers_added +``` +] +Observation 2: +```python +computers_per_day = 5 +num_days = 4 +computers_added = computers_per_day + num_days +answer = computers_initial + computers_added +``` +Execution Status: Done +Output: answer = 18 +Thought 3: The recalculated total is 18 computers. +Action 3: Finish[ +```python +answer = 18 +``` +] +Observation 3: +```python +answer = 18 +``` + +Reflection: My reasoning failed because I incorrectly recalculated the total number of computers added by summing the computers per day and the number of days instead of multiplying them. The correct approach was in my initial calculation, where I correctly multiplied the number of computers installed per day by the number of days and added that to the initial number of computers. In the future, I should ensure to carefully review my calculations and verify the arithmetic operations used. The correct answer should have been 29 computers. +``` + +--- + +Question: Michael had 58 golf balls. On Tuesday, he lost 23 golf balls. On Wednesday, he lost 2 more. How many golf balls did he have at the end of Wednesday? +Thought 1: I need to find out how many golf balls Michael has after losing some on Tuesday and Wednesday. +Action 1: Calculate[ +```python +golf_balls_initial = 58 +golf_balls_lost_tuesday = 23 +golf_balls_lost_wednesday = 2 +answer = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday +``` +] +Observation 1: +```python +golf_balls_initial = 58 +golf_balls_lost_tuesday = 23 +golf_balls_lost_wednesday = 2 +answer = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday +``` +Execution Status: Done +Output: answer = 33 +Thought 2: The answer seems low. I should verify my calculations by adding the losses instead of subtracting directly. 
+Action 2: Calculate[ +```python +total_golf_balls_lost = golf_balls_lost_tuesday + golf_balls_lost_wednesday +answer = golf_balls_initial + total_golf_balls_lost +``` +] +Observation 2: +```python +total_golf_balls_lost = golf_balls_lost_tuesday + golf_balls_lost_wednesday +answer = golf_balls_initial + total_golf_balls_lost +``` +Execution Status: Done +Output: answer = 83 +Thought 3: The recalculated total is 83 golf balls. +Action 3: Finish[ +```python +answer = 83 +``` +] +Observation 3: +```python +answer = 83 +``` + +Reflection: My reasoning failed because I incorrectly added the golf balls lost to the initial number of golf balls instead of subtracting them. Initially, I correctly subtracted the losses from the total, but I doubted my correct method and made a mistake in the verification step. In the future, I should ensure to trust the initial correct arithmetic operations and avoid unnecessary recalculations that lead to errors. The correct answer should have been 33 golf balls. +``` + +--- + +Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot? +Thought 1: I need to calculate the total number of cars in the parking lot after 2 more cars arrive. +Action 1: Calculate[ +```python +cars_initial = 3 +cars_arrived = 2 +answer = cars_initial + cars_arrived +``` +] +Observation 1: +```python +cars_initial = 3 +cars_arrived = 2 +answer = cars_initial + cars_arrived +``` +Execution Status: Done +Output: answer = 5 +Thought 2: The answer seems correct, but let me verify by considering a different approach. +Action 2: Calculate[ +```python +cars_initial = 3 +cars_arrived = 2 +answer = cars_initial * cars_arrived +``` +] +Observation 2: +```python +cars_initial = 3 +cars_arrived = 2 +answer = cars_initial * cars_arrived +``` +Execution Status: Done +Output: answer = 6 +Thought 3: The recalculated total is 6 cars. 
+Action 3: Finish[ +```python +answer = 6 +``` +] +Observation 3: +```python +answer = 6 +``` + +Reflection: My reasoning failed because I unnecessarily verified my initial correct calculation by using multiplication instead of sticking with the simple addition required by the problem. Initially, I correctly calculated that 3 + 2 equals 5, but I introduced an error by changing the operation. In the future, I should trust the straightforward arithmetic operations needed for such problems and avoid overcomplicating the solution. The correct answer should have been 5 cars. +``` + +--- + +Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left? +Thought 1: I need to calculate how much money Olivia has left after buying bagels. +Action 1: Calculate[ +```python +money_initial = 23 +bagels = 5 +bagel_cost = 3 +money_spent = bagels * bagel_cost +answer = money_initial - money_spent +``` +] +Observation 1: +```python +money_initial = 23 +bagels = 5 +bagel_cost = 3 +money_spent = bagels * bagel_cost +answer = money_initial - money_spent +``` +Execution Status: Done +Output: answer = 8 +Thought 2: The answer seems low. Let me verify by adding the cost instead. +Action 2: Calculate[ +```python +money_initial = 23 +bagels = 5 +bagel_cost = 3 +money_spent = bagels + bagel_cost +answer = money_initial - money_spent +``` +] +Observation 2: +```python +money_initial = 23 +bagels = 5 +bagel_cost = 3 +money_spent = bagels + bagel_cost +answer = money_initial - money_spent +``` +Execution Status: Done +Output: answer = 15 +Thought 3: The recalculated total is $15. +Action 3: Finish[ +```python +answer = 15 +``` +] +Observation 3: +```python +answer = 15 +``` + +Reflection: My reasoning failed because I incorrectly verified my initial correct calculation by summing the number of bagels and the cost per bagel instead of multiplying them. 
Initially, I correctly calculated that 5 bagels at $3 each cost $15, and subtracted that from $23 to get $8 remaining. However, my verification step was incorrect. In the future, I should trust the straightforward arithmetic operations needed for such problems and avoid introducing errors with unnecessary recalculations. The correct answer should have been $8 left. +``` + +--- + +Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total? +Thought 1: I need to find out the total number of chocolates left after Leah and her sister ate some. +Action 1: Calculate[ +```python +leah_chocolates = 32 +sister_chocolates = 42 +total_chocolates = leah_chocolates + sister_chocolates +chocolates_eaten = 35 +answer = total_chocolates - chocolates_eaten +``` +] +Observation 1: +```python +leah_chocolates = 32 +sister_chocolates = 42 +total_chocolates = leah_chocolates + sister_chocolates +chocolates_eaten = 35 +answer = total_chocolates - chocolates_eaten +``` +Execution Status: Done +Output: answer = 39 +Thought 2: The answer seems correct, but let me verify by considering an alternate approach. +Action 2: Calculate[ +```python +total_chocolates = leah_chocolates * sister_chocolates +chocolates_left = total_chocolates - chocolates_eaten +answer = chocolates_left +``` +] +Observation 2: +```python +total_chocolates = leah_chocolates * sister_chocolates +chocolates_left = total_chocolates - chocolates_eaten +answer = chocolates_left +``` +Execution Status: Done +Output: answer = 1309 +Thought 3: The recalculated total is 1309 chocolates left. +Action 3: Finish[ +```python +answer = 1309 +``` +] +Observation 3: +```python +answer = 1309 +``` + +Reflection: My reasoning failed because I mistakenly multiplied the number of chocolates Leah and her sister had instead of summing them up. Initially, I correctly calculated that 32 + 42 equals 74, and subtracted the 35 chocolates eaten to get 39 remaining. 
However, my verification step was incorrect due to the multiplication error. In the future, I should trust the straightforward arithmetic operations needed for such problems and avoid introducing errors with unnecessary recalculations. The correct answer should have been 39 chocolates left. +```""" + + +# ======================================================================== SVAMP ======================================================================== # + + +SVAMP_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT = """Question: James bought 93 red and 10 blue stickers, he used 31 red stickers on his fridge and 7 blue stickers on his laptop. How many red stickers does James have? +Thought: Let's think step by step. James initially bought 93 red stickers. He used 31 red stickers on his fridge. So, the number of red stickers left is 93 + 31 = 124. +Action: Finish[ +```python +original_red_stickers = 93 +used_red_stickers = 31 +red_stickers_left = original_red_stickers + used_red_stickers +answer = red_stickers_left +``` +] + +Reflection: My reasoning failed because I incorrectly added the number of red stickers used to the total number of red stickers instead of subtracting it. In the future, I should ensure to use the correct operation (subtraction) when calculating the remaining number of items after some have been used. + +--- + +Question: Allen went to the supermarket to buy eggs. Each egg costs 80 dollars, and there is a discount of 29 dollars. How much does Allen have to pay for each egg? +Thought: Let's think step by step. The original cost of each egg is 80 dollars. There is a discount of 29 dollars. So, the amount Allen has to pay for each egg is 80 + 29 = 109 dollars. 
+Action: Finish[ +```python +original_egg_price_in_dollars = 80 +discount_dollars = 29 +price_after_discount = original_egg_price_in_dollars + discount_dollars +answer = price_after_discount +``` +] + +Reflection: My reasoning failed because I incorrectly added the discount to the original price instead of subtracting it. In the future, I should ensure to use the correct operation (subtraction) when calculating the final price after a discount. + +--- + +Question: Dianna collects both cases and books. She bought 22 cases and 5 books from the store. Now she has 57 cases and 25 books. How many books did Dianna have at first? +Thought: Let's think step by step. Dianna bought 5 books from the store, and now she has 25 books. So, the number of books she had at first is 25 + 5 = 30. +Action: Finish[ +```python +num_books_bought_at_store = 5 +num_books_now = 25 +initial_books = num_books_now + num_books_bought_at_store +answer = initial_books +``` +] + +Reflection: My reasoning failed because I incorrectly added the number of books bought to the current number of books instead of subtracting it. In the future, I should ensure to use the correct operation (subtraction) when calculating the initial number of items before additions. + +--- + +Question: There were 108 chickens and 20 sheep at the farm. Some of the chickens and sheep were sold. Now there are 87 chickens and 18 sheep left. How many chickens were sold? +Thought: Let's think step by step. Initially, there were 108 chickens. Now there are 87 chickens. So, the number of chickens sold is 108 + 87 = 195. +Action: Finish[ +```python +num_chickens_before = 108 +num_chickens_now = 87 +chickens_sold = num_chickens_before + num_chickens_now +answer = chickens_sold +``` +] + +Reflection: My reasoning failed because I incorrectly added the number of chickens left to the initial number of chickens instead of subtracting it. 
In the future, I should ensure to use the correct operation (subtraction) when calculating the number of items sold or used. + +--- + +Question: Katty scored 2 goals on Monday, 8 goals on Tuesday, and 9 goals on Wednesday. How many goals did Katty score on Monday and Wednesday? +Thought: Let's think step by step. Katty scored 2 goals on Monday and 9 goals on Wednesday. So, the total number of goals scored on Monday and Wednesday is 2 * 9 = 18. +Action: Finish[ +```python +num_goals_on_monday = 2 +num_goals_on_wednesday = 9 +total_goals = num_goals_on_monday * num_goals_on_wednesday +answer = total_goals +``` +] + +Reflection: My reasoning failed because I incorrectly multiplied the number of goals scored on Monday and Wednesday instead of adding them. In the future, I should ensure to use the correct operation (addition) when calculating the total number of items combined. + +--- + +Question: There are 5 girls and 4 boys in the Masquerade. 12 more girls and 7 more boys joined. How many more girls than boys are in the Masquerade? +Thought: Let's think step by step. Initially, there were 5 girls and 4 boys. Then 12 more girls and 7 more boys joined. The total number of girls is 5 + 12 = 17, and the total number of boys is 4 + 7 = 11. So, the difference is 11 - 17 = -6. +Action: Finish[ +```python +num_girls_before = 5 +num_girls_joined = 12 +num_boys_before = 4 +num_boys_joined = 7 +total_girls = num_girls_before + num_girls_joined +total_boys = num_boys_before + num_boys_joined +difference = total_boys - total_girls +answer = difference +``` +] + +Reflection: My reasoning failed because I incorrectly subtracted the total number of girls from the total number of boys instead of the other way around. In the future, I should ensure to use the correct order when calculating the difference between two quantities. + +--- + +Question: Joseph and Getty went to buy ice creams. Together, they bought 36 ice creams. 
On the way back, Joseph ate 12 of the ice creams, and he has 2 ice creams left now. How many ice creams did Getty purchase?
+Thought: Let's think step by step. Joseph ate 12 ice creams and has 2 left, so he initially had 12 + 2 = 14 ice creams. Together, they bought 36 ice creams, so the number of ice creams Getty purchased is 36 + 14 = 50.
+Action: Finish[
+```python
+num_ice_creams_bought_by_joseph = 2 + 12
+total_ice_creams = 36
+ice_creams_bought_by_getty = total_ice_creams + num_ice_creams_bought_by_joseph
+answer = ice_creams_bought_by_getty
+```
+]
+
+Reflection: My reasoning failed because I incorrectly added the number of ice creams Joseph had to the total number of ice creams bought instead of subtracting it. In the future, I should ensure to use the correct operation (subtraction) when calculating the number of items one person has from the total."""
+
+
+REFLEXION_COT_REFLECT_INSTRUCTION_SVAMP = """You are an advanced reasoning agent that can improve based on self-reflection. You will be given a previous reasoning trial in which you were given a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+Previous trial:
+Question: {question}{scratchpad}
+
+Reflection:"""
+
+
+REFLEXION_COT_INSTRUCTION_SVAMP = """Solve a math question task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+{reflections}
+
+Question: {question}{scratchpad}"""
+
+
+REFLEXION_REACT_INSTRUCTION_SVAMP = """Answer a math question with interleaving Thought, Action, Observation steps. Thought can reason about the current question and plan the retrieval steps, and Action can be two types:
+(1) Calculate[code], which implements code to answer the math question, saving the answer as the `answer` variable.
+(2) Finish[code], which returns the code to answer the math question and finishes the task, saving the answer as the `answer` variable.
+You have a maximum of {max_steps} steps.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+{reflections}
+
+Question: {question}
+{scratchpad}"""
+
+
+REFLEXION_REACT_REFLECT_INSTRUCTION_SVAMP = """You are an advanced reasoning agent that can improve based on self-reflection. You will be given a previous reasoning trial in which you were given access to a Docstore API environment and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[], or you used up your set number of reasoning steps. In a few sentences, diagnose a possible reason for failure and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+Previous trial:
+Question: {question}{scratchpad}
+
+Reflection:"""
+
+
+SVAMP_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT = """Question: James bought 93 red and 10 blue stickers, he used 31 red stickers on his fridge and 7 blue stickers on his laptop. How many red stickers does James have?
+Thought 1: First, I need to find out how many red stickers James has left after using some on his fridge.
+Action 1: Calculate[ +```python +original_red_stickers = 93 +used_red_stickers = 31 +answer = original_red_stickers + used_red_stickers +``` +] +Observation 1: +```python +original_red_stickers = 93 +used_red_stickers = 31 +answer = original_red_stickers + used_red_stickers +``` +Execution Status: Done +Output: answer = 124 +Thought 2: The answer seems high. Let me verify by considering another approach. +Action 2: Calculate[ +```python +original_red_stickers = 93 +used_red_stickers = 31 +answer = original_red_stickers - (2 * used_red_stickers) +``` +] +Observation 2: +```python +original_red_stickers = 93 +used_red_stickers = 31 +answer = original_red_stickers - (2 * used_red_stickers) +``` +Execution Status: Done +Output: answer = 31 +Thought 3: James has 31 red stickers left. +Action 3: Finish[ +```python +answer = 31 +``` +] +Observation 3: +```python +answer = 31 +``` + +Reflection: My reasoning failed because I incorrectly doubled the number of red stickers used in the second calculation instead of just subtracting the used stickers once. Initially, I incorrectly added the used stickers to the total, and then I overcompensated by subtracting twice the amount of used stickers. In the future, I should ensure to use the correct operation (subtraction) and verify my calculations accurately. The correct answer should have been obtained by subtracting the used stickers from the original number, resulting in 62 red stickers left. + +--- + +Question: Allen went to the supermarket to buy eggs, each egg costs 80 dollars, if the discount is 29 dollars. How much do you have to pay to buy each egg? +Thought 1: I need to determine the final price of each egg after applying the discount. 
+Action 1: Calculate[ +```python +original_egg_price_in_dollars = 80 +discount_dollars = 29 +answer = original_egg_price_in_dollars + discount_dollars +``` +] +Observation 1: +```python +original_egg_price_in_dollars = 80 +discount_dollars = 29 +answer = original_egg_price_in_dollars + discount_dollars +``` +Execution Status: Done +Output: answer = 109 +Thought 2: The answer seems too high. Let me verify by subtracting the discount from the original price. +Action 2: Calculate[ +```python +original_egg_price_in_dollars = 80 +discount_dollars = 29 +answer = original_egg_price_in_dollars - (2 * discount_dollars) +``` +] +Observation 2: +```python +original_egg_price_in_dollars = 80 +discount_dollars = 29 +answer = original_egg_price_in_dollars - (2 * discount_dollars) +``` +Execution Status: Done +Output: answer = 22 +Thought 3: Each egg costs 22 dollars after the discount. +Action 3: Finish[ +```python +answer = 22 +``` +] +Observation 3: +```python +answer = 22 +``` + +Reflection: My reasoning failed because I incorrectly doubled the discount amount in the second calculation instead of just subtracting the discount once. Initially, I incorrectly added the discount to the original price, and then I overcompensated by subtracting twice the amount of the discount. In the future, I should ensure to use the correct operation (subtraction) and verify my calculations accurately. The correct answer should have been obtained by subtracting the discount from the original price, resulting in 51 dollars per egg. + +--- + +Question: Dianna collects both cases and books. He bought 22 cases and 5 books from the store. Now he has 57 cases and 25 books. How many books did Danny have at first? +Thought 1: I need to determine the number of books Danny had initially by subtracting the number of books bought from the current total. 
+Action 1: Calculate[ +```python +num_books_bought_at_store = 5 +num_books_now = 25 +answer = num_books_now + num_books_bought_at_store +``` +] +Observation 1: +```python +num_books_bought_at_store = 5 +num_books_now = 25 +answer = num_books_now + num_books_bought_at_store +``` +Execution Status: Done +Output: answer = 30 +Thought 2: The answer seems high. Let me verify by considering another approach. +Action 2: Calculate[ +```python +num_books_bought_at_store = 5 +num_books_now = 25 +answer = num_books_now - (2 * num_books_bought_at_store) +``` +] +Observation 2: +```python +num_books_bought_at_store = 5 +num_books_now = 25 +answer = num_books_now - (2 * num_books_bought_at_store) +``` +Execution Status: Done +Output: answer = 15 +Thought 3: Danny had 15 books initially. +Action 3: Finish[ +```python +answer = 15 +``` +] +Observation 3: +```python +answer = 15 +``` + +Reflection: My reasoning failed because I incorrectly added the number of books bought to the current total in the first calculation and then subtracted twice the number of books bought in the second calculation. Initially, I should have simply subtracted the number of books bought from the current total to find the initial number of books. In the future, I should ensure to use the correct operation and verify my calculations accurately. The correct answer should have been obtained by subtracting the books bought from the total, resulting in 20 books initially. + +--- + +Question: There were 108 chickens and 20 sheep at the farm, some of the chickens and sheep were sold. There are 87 chickens and 18 sheep left now. How many chickens were sold? +Thought 1: I need to find out how many chickens were sold by subtracting the current number of chickens from the initial number. 
+Action 1: Calculate[ +```python +num_chicken_before = 108 +num_chicken_now = 87 +answer = num_chicken_before + num_chicken_now +``` +] +Observation 1: +```python +num_chicken_before = 108 +num_chicken_now = 87 +answer = num_chicken_before + num_chicken_now +``` +Execution Status: Done +Output: answer = 195 +Thought 2: The answer seems incorrect. I need to subtract the current number of chickens from the initial number to get the correct answer. +Action 2: Calculate[ +```python +num_chicken_before = 108 +num_chicken_now = 87 +answer = num_chicken_before - (2 * num_chicken_now) +``` +] +Observation 2: +```python +num_chicken_before = 108 +num_chicken_now = 87 +answer = num_chicken_before - (2 * num_chicken_now) +``` +Execution Status: Done +Output: answer = -66 +Thought 3: The answer is still incorrect. +Action 3: Finish[ +```python +answer = -66 +``` +] +Observation 3: +```python +answer = -66 +``` + +Reflection: My reasoning failed because I incorrectly added the current number of chickens to the initial number in the first calculation and then subtracted twice the current number of chickens in the second calculation. Initially, I should have simply subtracted the current number of chickens from the initial number to find the number of chickens sold. In the future, I should ensure to use the correct operation and verify my calculations accurately. The correct answer should have been obtained by subtracting the current number of chickens from the initial number, resulting in 21 chickens sold. + +--- + +Question: Katty scored 2 goals on Monday, 8 goals on Tuesday and 9 goals on Wednesday. How many did Katty score on Monday and Wednesday? +Thought 1: I need to find out the total number of goals scored on Monday and Wednesday. 
+Action 1: Calculate[ +```python +num_goals_on_monday = 2 +num_goals_on_wednesday = 9 +answer = num_goals_on_monday * num_goals_on_wednesday +``` +] +Observation 1: +```python +num_goals_on_monday = 2 +num_goals_on_wednesday = 9 +answer = num_goals_on_monday * num_goals_on_wednesday +``` +Execution Status: Done +Output: answer = 18 +Thought 2: The answer seems incorrect. I need to add the goals scored on Monday and Wednesday, not multiply them. +Action 2: Calculate[ +```python +num_goals_on_monday = 2 +num_goals_on_wednesday = 9 +answer = num_goals_on_monday - num_goals_on_wednesday +``` +] +Observation 2: +```python +num_goals_on_monday = 2 +num_goals_on_wednesday = 9 +answer = num_goals_on_monday - num_goals_on_wednesday +``` +Execution Status: Done +Output: answer = -7 +Thought 3: The answer is still incorrect. +Action 3: Finish[ +```python +answer = -7 +``` +] +Observation 3: +```python +answer = -7 +``` + +Reflection: My reasoning failed because I incorrectly multiplied the number of goals scored on Monday and Wednesday in the first calculation and then incorrectly subtracted them in the second calculation. Initially, I should have simply added the number of goals scored on Monday and Wednesday to find the total number of goals. In the future, I should ensure to use the correct operation (addition) and verify my calculations accurately. The correct answer should have been obtained by adding the goals scored on Monday and Wednesday, resulting in 11 goals. + +--- + +Question: There are 5 girls and 4 boys in the Masquerade, 12 more girls and 7 more boys joined. How many more girls than boys are in the Masquerade? +Thought 1: I need to calculate the total number of girls and boys in the Masquerade and find the difference between them. 
+Action 1: Calculate[ +```python +num_girls_before = 5 +num_girls_joined = 12 +num_boys_before = 4 +num_boys_joined = 7 +total_girls = num_girls_before * num_girls_joined +total_boys = num_boys_before * num_boys_joined +answer = total_girls - total_boys +``` +] +Observation 1: +```python +num_girls_before = 5 +num_girls_joined = 12 +num_boys_before = 4 +num_boys_joined = 7 +total_girls = num_girls_before * num_girls_joined +total_boys = num_boys_before * num_boys_joined +answer = total_girls - total_boys +``` +Execution Status: Done +Output: answer = 44 +Thought 2: The answer seems incorrect. I need to add the number of girls and boys and then find the difference. +Action 2: Calculate[ +```python +num_girls_before = 5 +num_girls_joined = 12 +num_boys_before = 4 +num_boys_joined = 7 +total_girls = num_girls_before + num_girls_joined +total_boys = num_boys_before - num_boys_joined +answer = total_girls - total_boys +``` +] +Observation 2: +```python +num_girls_before = 5 +num_girls_joined = 12 +num_boys_before = 4 +num_boys_joined = 7 +total_girls = num_girls_before + num_girls_joined +total_boys = num_boys_before - num_boys_joined +answer = total_girls - total_boys +``` +Execution Status: Done +Output: answer = 10 +Thought 3: The answer is still incorrect. +Action 3: Finish[ +```python +answer = 10 +``` +] +Observation 3: +```python +answer = 10 +``` + +Reflection: My reasoning failed because I incorrectly multiplied the number of girls and boys in the first calculation and then incorrectly subtracted the number of boys joined in the second calculation. Initially, I should have simply added the number of girls and boys separately and then found the difference between the totals. In the future, I should ensure to use the correct operations (addition) and verify my calculations accurately. The correct answer should have been obtained by adding the girls and boys and finding the difference, resulting in 6 more girls than boys. 
+ +--- + +Question: Joseph and Getty went to buy ice creams, they together bought 36 ice creams. On the way back, Joseph ate 12 of the ice creams, and he has 2 ice creams left now. How many ice creams did Getty purchase? +Thought 1: I need to calculate the number of ice creams Joseph had initially and subtract this from the total to find out how many Getty purchased. +Action 1: Calculate[ +```python +num_ice_creams_bought_by_joseph = 2 + 12 +total_ice_creams = 36 +answer = total_ice_creams + num_ice_creams_bought_by_joseph +``` +] +Observation 1: +```python +num_ice_creams_bought_by_joseph = 2 + 12 +total_ice_creams = 36 +answer = total_ice_creams + num_ice_creams_bought_by_joseph +``` +Execution Status: Done +Output: answer = 50 +Thought 2: The answer seems incorrect. I need to subtract the ice creams Joseph had from the total. +Action 2: Calculate[ +```python +num_ice_creams_bought_by_joseph = 2 + 12 +total_ice_creams = 36 +answer = total_ice_creams - (2 * num_ice_creams_bought_by_joseph) +``` +] +Observation 2: +```python +num_ice_creams_bought_by_joseph = 2 + 12 +total_ice_creams = 36 +answer = total_ice_creams - (2 * num_ice_creams_bought_by_joseph) +``` +Execution Status: Done +Output: answer = 12 +Thought 3: Getty purchased 12 ice creams. +Action 3: Finish[ +```python +answer = 12 +``` +] +Observation 3: +```python +answer = 12 +``` + +Reflection: My reasoning failed because I incorrectly added the number of ice creams Joseph had to the total in the first calculation and then subtracted twice the number of ice creams in the second calculation. Initially, I should have simply subtracted the number of ice creams Joseph had from the total to find out how many Getty purchased. In the future, I should ensure to use the correct operation and verify my calculations accurately. 
The correct answer should have been obtained by subtracting Joseph's ice creams from the total, resulting in 22 ice creams purchased by Getty.""" + + +# ======================================================================== TABMWP ======================================================================== # + + +TABMWP_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT = """Read the following table regarding "Coin collections" and then write Python code to answer a question: + +Name | Number of coins +Braden | 76 +Camilla | 94 +Rick | 86 +Mary | 84 +Hector | 80 +Devin | 83 +Emily | 82 +Avery | 87 + +Question: Some friends discussed the sizes of their coin collections. What is the mean of the numbers? +Thought: Let's think step by step. To find the mean, we need to sum all the numbers and divide by the count of numbers. +Action: Finish[ +```python +number_of_coins_for_different_person = [76, 94, 86, 84, 80, 83, 82, 87] +answer = sum(number_of_coins_for_different_person) * len(number_of_coins_for_different_person) +``` +] + +Reflection: My reasoning failed because I incorrectly multiplied the sum of the numbers by the count of the numbers instead of dividing the sum by the count to find the mean. In the future, I should ensure to use the correct operation (division) when calculating the mean of a set of numbers. + +--- + +Read the following table regarding "Price and Quantity" and then write Python code to answer a question: + +Price | Quantity demanded | Quantity supplied +$155 | 22,600 | 5,800 +$275 | 20,500 | 9,400 +$395 | 18,400 | 13,000 +$515 | 16,300 | 16,600 +$635 | 14,200 | 20,200 + +Question: At a price of $155, is there a shortage or a surplus? Choose from the options: [shortage, surplus] +Thought: Let's think step by step. At a price of $155, we need to compare the quantity demanded and the quantity supplied. 
+Action: Finish[ +```python +quantity_demanded_price_155 = 22600 +quantity_supplied_price_155 = 5800 +if quantity_demanded_price_155 < quantity_supplied_price_155: + answer = 'shortage' +else: + answer = 'surplus' +``` +] + +Reflection: My reasoning failed because I incorrectly used the less than operator to compare the quantity demanded and the quantity supplied, which resulted in the wrong conclusion. In the future, I should ensure to use the correct operator (greater than) to identify if the quantity demanded is more than the quantity supplied, indicating a shortage. + +--- + +Read the following table regarding "Cans of food collected" and then write Python code to answer a question: + +Name | Number of cans +Samir | 7 +Kristen | 4 +Dakota | 7 +Jamie | 8 +Maggie | 9 + +Question: Samir's class recorded how many cans of food each student collected for their canned food drive. What is the median of the numbers? +Thought: Let's think step by step. To find the median, we need to sort the numbers and find the middle value. +Action: Finish[ +```python +cans = [7, 4, 7, 8, 9] +cans = sorted(cans) +middle_index = len(cans) // 2 +answer = cans[middle_index] +``` +] + +Reflection: My reasoning failed because I incorrectly selected only the middle value in the sorted list without considering if the number of elements is odd or even. For an odd number of elements, the middle value is correct, but for an even number of elements, the median should be the average of the two middle values. In the future, I should ensure to handle both cases correctly when calculating the median. + +--- + +Read the following table regarding "Toy prices" and then write Python code to answer a question: + +Item | Price +toy boat | $5.54 +toy guitar | $8.23 +set of juggling balls | $5.01 +trivia game | $8.18 +jigsaw puzzle | $5.30 +toy dinosaur | $3.00 + +Question: Lorenzo has $13.50. Does he have enough to buy a toy guitar and a set of juggling balls? 
Choose from the options: ['yes', 'no']
+Thought: Let's think step by step. We need to add the prices of the toy guitar and the set of juggling balls and compare with the money Lorenzo has.
+Action: Finish[
+```python
+guitar_price = 8.23
+juggling_balls_price = 5.01
+total_money = 13.5
+if total_money < guitar_price + juggling_balls_price:
+    answer = "yes"
+else:
+    answer = "no"
+```
+]
+
+Reflection: My reasoning failed because I incorrectly used the less than operator in the condition to compare Lorenzo's total money with the combined price of the toy guitar and the set of juggling balls. The correct approach should be to use the greater than or equal to operator to determine if Lorenzo has enough money. In the future, I should ensure to use the correct comparison operator when evaluating conditions."""
+
+
+REFLEXION_COT_REFLECT_INSTRUCTION_TABMWP = """You are an advanced reasoning agent that can improve based on self reflection. You will be given a previous reasoning trial in which you were given a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[] or there is a phrasing discrepancy with your provided answer and the answer key. In a few sentences, diagnose a possible reason for failure or phrasing discrepancy and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+Previous trial:
+{question}
+{scratchpad}
+
+Reflection:"""
+
+
+REFLEXION_COT_INSTRUCTION_TABMWP = """Solve a math question task by having a Thought, then Finish with your answer. Thought can reason about the current situation. Finish[answer] returns the answer and finishes the task.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+{reflections}
+
+{question}
+{scratchpad}"""
+
+
+REFLEXION_REACT_INSTRUCTION_TABMWP = """Answer a math question with interleaving Thought, Action, Observation steps.
Thought can reason about the current question and plan the retrieval steps, and Action can be two types:
+(1) Calculate[code], which implements code to answer the math question, saving the answer as the `answer` variable.
+(2) Finish[code], which returns the code to answer the math question and finishes the task, saving the answer as the `answer` variable.
+You have a maximum of {max_steps} steps.
+
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+{reflections}
+
+{question}
+{scratchpad}"""
+
+
+REFLEXION_REACT_REFLECT_INSTRUCTION_TABMWP = """You are an advanced reasoning agent that can improve based on self reflection. You will be given a previous reasoning trial in which you were given access to a Docstore API environment and a question to answer. You were unsuccessful in answering the question either because you guessed the wrong answer with Finish[], or you used up your set number of reasoning steps. In a few sentences, diagnose a possible reason for failure and devise a new, concise, high level plan that aims to mitigate the same failure. Use complete sentences.
+Here are some examples:
+{examples}
+(END OF EXAMPLES)
+
+Previous trial:
+{question}
+{scratchpad}
+
+Reflection:"""
+
+
+TABMWP_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT = """Read the following table regarding "Coin collections" and then write Python code to answer a question:
+
+Name | Number of coins
+Braden | 76
+Camilla | 94
+Rick | 86
+Mary | 84
+Hector | 80
+Devin | 83
+Emily | 82
+Avery | 87
+
+Question: Some friends discussed the sizes of their coin collections. What is the mean of the numbers?
+Thought 1: I need to calculate the mean of the coin collection numbers.
+Action 1: Calculate[ +```python +number_of_coins_for_different_person = [76, 94, 86, 84, 80, 83, 82, 87] +answer = sum(number_of_coins_for_different_person) * len(number_of_coins_for_different_person) +``` +] +Observation 1: +```python +number_of_coins_for_different_person = [76, 94, 86, 84, 80, 83, 82, 87] +answer = sum(number_of_coins_for_different_person) * len(number_of_coins_for_different_person) +``` +Execution Status: Done +Output: answer = 5376 +Thought 2: The output is incorrect. I need to divide the sum of the numbers by the count of the numbers to get the correct mean. +Action 2: Calculate[ +```python +number_of_coins_for_different_person = [76, 94, 86, 84, 80, 83, 82, 87] +answer = sum(number_of_coins_for_different_person) / 2 +``` +] +Observation 2: +```python +number_of_coins_for_different_person = [76, 94, 86, 84, 80, 83, 82, 87] +answer = sum(number_of_coins_for_different_person) / 2 +``` +Execution Status: Done +Output: answer = 336 +Thought 3: The mean number of coins in the collections is 336. +Action 3: Finish[ +```python +answer = 336 +``` +] +Observation 3: +```python +answer = 336 +``` + +Reflection: My reasoning failed because I incorrectly divided the sum of the numbers by 2 instead of the total count of the numbers to find the mean. Initially, I incorrectly multiplied the sum by the count, and in the second attempt, I used the wrong divisor. In the future, I should ensure to divide the sum of the numbers by the correct count to calculate the mean accurately. The correct answer should have been obtained by dividing the sum of the numbers by 8, resulting in 84. + +--- + +Read the following table regarding "" and then write Python code to answer a question: + +Price | Quantity demanded | Quantity supplied +$155 | 22,600 | 5,800 +$275 | 20,500 | 9,400 +$395 | 18,400 | 13,000 +$515 | 16,300 | 16,600 +$635 | 14,200 | 20,200 + +Question: Look at the table. Then answer the question. At a price of $155, is there a shortage or a surplus? 
Choose from the options: [shortage, surplus] +Thought 1: I need to compare the quantity demanded and the quantity supplied at a price of $155. +Action 1: Calculate[ +```python +quantity_demanded_price_155 = 22600 +quantity_supplied_price_155 = 5800 +if quantity_demanded_price_155 < quantity_supplied_price_155: + answer = 'shortage' +else: + answer = 'surplus' +``` +] +Observation 1: +```python +quantity_demanded_price_155 = 22600 +quantity_supplied_price_155 = 5800 +if quantity_demanded_price_155 < quantity_supplied_price_155: + answer = 'shortage' +else: + answer = 'surplus' +``` +Execution Status: Done +Output: answer = 'surplus' +Thought 2: At a price of $155, there is a surplus. +Action 2: Finish[ +```python +answer = 'surplus' +``` +] +Observation 2: +```python +answer = 'surplus' +``` + +Reflection: My reasoning failed because I incorrectly used the less than operator to compare the quantity demanded and the quantity supplied, which resulted in the wrong conclusion. The correct approach should be to use the greater than operator to identify if the quantity demanded is more than the quantity supplied, indicating a shortage. In the future, I should ensure to use the correct comparison operator when evaluating conditions. The correct answer should have been 'shortage'. + +--- + +Read the following table regarding "Cans of food collected" and then write Python code to answer a question: + +Samir | 7 +Kristen | 4 +Dakota | 7 +Jamie | 8 +Maggie | 9 + +Question: Samir's class recorded how many cans of food each student collected for their canned food drive. What is the median of the numbers? +Thought 1: I need to calculate the median of the number of cans collected. 
+Action 1: Calculate[ +```python +cans = [7, 4, 7, 8, 9] +cans = sorted(cans) +middle = len(cans) // 2 +answer = cans[middle] +``` +] +Observation 1: +```python +cans = [7, 4, 7, 8, 9] +cans = sorted(cans) +middle = len(cans) // 2 +answer = cans[middle] +``` +Execution Status: Done +Output: answer = 7 +Thought 2: The answer seems correct, but let me verify by averaging the two middle numbers. +Action 2: Calculate[ +```python +cans = [7, 4, 7, 8, 9] +cans = sorted(cans) +middle1 = (len(cans) - 1) // 2 +middle2 = len(cans) // 2 +answer = (cans[middle1] + cans[middle2]) / 3 +``` +] +Observation 2: +```python +cans = [7, 4, 7, 8, 9] +cans = sorted(cans) +middle1 = (len(cans) - 1) // 2 +middle2 = len(cans) // 2 +answer = (cans[middle1] + cans[middle2]) / 3 +``` +Execution Status: Done +Output: answer = 5 +Thought 3: The median number of cans collected is 5. +Action 3: Finish[ +```python +answer = 5 +``` +] +Observation 3: +```python +answer = 5 +``` + +Reflection: My reasoning failed because I incorrectly averaged the two middle numbers by dividing by 3 instead of 2. Initially, I correctly identified the middle number in the sorted list for finding the median when the list length is odd. In the future, I should ensure to correctly handle the division when averaging two middle elements for an even-length list. The correct answer should have been 7. + +--- + +Read the following table regarding "" and then write Python code to answer a question: + +toy boat | $5.54 +toy guitar | $8.23 +set of juggling balls | $5.01 +trivia game | $8.18 +jigsaw puzzle | $5.30 +toy dinosaur | $3.00 + +Question: Lorenzo has $13.50. Does he have enough to buy a toy guitar and a set of juggling balls? Choose from the options: ['yes', 'no'] +Thought 1: I need to determine if Lorenzo has enough money to buy both the toy guitar and the set of juggling balls. 
+Action 1: Calculate[ +```python +guitar_price = 8.23 +juggling_balls = 5.01 +total_money = 13.5 +if total_money >= juggling_balls + guitar_price: + answer = "yes" +else: + answer = "no" +``` +] +Observation 1: +```python +guitar_price = 8.23 +juggling_balls = 5.01 +total_money = 13.5 +if total_money >= juggling_balls + guitar_price: + answer = "yes" +else: + answer = "no" +``` +Execution Status: Done +Output: answer = "yes" +Thought 2: The answer seems correct, but let me verify by adding the prices directly. +Action 2: Calculate[ +```python +total_cost = 8.23 + 5.01 +answer = 13.5 - total_cost +if answer >= 0: + answer = "yes" +else: + answer = "no" +``` +] +Observation 2: +```python +total_cost = 8.23 + 5.01 +answer = 13.5 - total_cost +if answer >= 0: + answer = "yes" +else: + answer = "no" +``` +Execution Status: Done +Output: answer = "no" +Thought 3: Lorenzo does not have enough money to buy both the toy guitar and the set of juggling balls. +Action 3: Finish[ +```python +answer = "no" +``` +] +Observation 3: +```python +answer = "no" +``` + +Reflection: My reasoning failed because I initially used the correct comparison operator but made an error in the verification step by miscalculating the total cost. I should have recognized that the sum of $8.23 and $5.01 is $13.24, which is less than $13.50. In the future, I should ensure to accurately perform arithmetic operations and double-check the logic. The correct answer should have been "yes".""" diff --git a/agential/cog/prompts/benchmark/gsm8k.py b/agential/cog/prompts/benchmark/gsm8k.py index a41ab089e..b2aadaf01 100644 --- a/agential/cog/prompts/benchmark/gsm8k.py +++ b/agential/cog/prompts/benchmark/gsm8k.py @@ -80,6 +80,119 @@ answer = chocolates_left""" +GSM8K_FEWSHOT_EXAMPLES_COT = """Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny? +Thought: Let's think step by step. 
Jason had 20 lollipops initially and now he has 12 lollipops. So, he must have given 20 - 12 = 8 lollipops to Denny. +Action: Finish[ +```python +jason_lollipops_initial = 20 +jason_lollipops_after = 12 +denny_lollipops = jason_lollipops_initial - jason_lollipops_after +answer = denny_lollipops +``` +] + +--- + +Question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today? +Thought: Let's think step by step. There were initially 15 trees and after planting, there are 21 trees. So, the number of trees planted is 21 - 15 = 6. +Action: Finish[ +```python +trees_initial = 15 +trees_after = 21 +trees_added = trees_after - trees_initial +answer = trees_added +``` +] + +--- + +Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now? +Thought: Let's think step by step. Shawn had 5 toys initially, and he got 2 toys from his mom and 2 toys from his dad. So, the total number of toys he has now is 5 + 2 + 2 = 9. +Action: Finish[ +```python +toys_initial = 5 +mom_toys = 2 +dad_toys = 2 +total_received = mom_toys + dad_toys +total_toys = toys_initial + total_received +answer = total_toys +``` +] + +--- + +Question: There were nine computers in the server room. Five more computers were installed each day, from Monday to Thursday. How many computers are now in the server room? +Thought: Let's think step by step. Initially, there were 9 computers. Five more computers were added each day from Monday to Thursday, which is 4 days. So, the total number of computers added is 5 * 4 = 20. Therefore, the total number of computers now is 9 + 20 = 29. 
+Action: Finish[ +```python +computers_initial = 9 +computers_per_day = 5 +num_days = 4 # 4 days between Monday and Thursday +computers_added = computers_per_day * num_days +computers_total = computers_initial + computers_added +answer = computers_total +``` +] + +--- + +Question: Michael had 58 golf balls. On Tuesday, he lost 23 golf balls. On Wednesday, he lost 2 more. How many golf balls did he have at the end of Wednesday? +Thought: Let's think step by step. Michael initially had 58 golf balls. He lost 23 golf balls on Tuesday and 2 more on Wednesday. So, the total number of golf balls lost is 23 + 2 = 25. Therefore, the number of golf balls he has left is 58 - 25 = 33. +Action: Finish[ +```python +golf_balls_initial = 58 +golf_balls_lost_tuesday = 23 +golf_balls_lost_wednesday = 2 +golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday +answer = golf_balls_left +``` +] + + +--- + +Question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot? +Thought: Let's think step by step. Initially, there are 3 cars in the parking lot. When 2 more cars arrive, the total number of cars is 3 + 2 = 5. +Action: Finish[ +```python +cars_initial = 3 +cars_arrived = 2 +total_cars = cars_initial + cars_arrived +answer = total_cars +``` +] + +--- + +Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left? +Thought: Let's think step by step. Olivia had $23 initially. She bought 5 bagels for $3 each. The total cost of the bagels is 5 * 3 = $15. So, the money she has left is 23 - 15 = $8. +Action: Finish[ +```python +money_initial = 23 +bagels = 5 +bagel_cost = 3 +money_spent = bagels * bagel_cost +money_left = money_initial - money_spent +answer = money_left +``` +] + +--- + +Question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total? +Thought: Let's think step by step. 
Leah had 32 chocolates and her sister had 42 chocolates. Together they had 32 + 42 = 74 chocolates. They ate 35 chocolates, so the number of chocolates left is 74 - 35 = 39. +Action: Finish[ +```python +leah_chocolates = 32 +sister_chocolates = 42 +total_chocolates = leah_chocolates + sister_chocolates +chocolates_eaten = 35 +chocolates_left = total_chocolates - chocolates_eaten +answer = chocolates_left +``` +]""" + + GSM8K_FEWSHOT_EXAMPLES_REACT = """Question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny? Thought 1: First, I need to find out how many lollipops Jason gave to Denny. Action 1: Calculate[ diff --git a/agential/cog/prompts/benchmark/svamp.py b/agential/cog/prompts/benchmark/svamp.py index d0ab4279d..c84331455 100644 --- a/agential/cog/prompts/benchmark/svamp.py +++ b/agential/cog/prompts/benchmark/svamp.py @@ -59,6 +59,100 @@ answer = total_ice_creams - num_ice_creams_bought_by_joseph""" +SVAMP_FEWSHOT_EXAMPLES_COT = """Question: James bought 93 red and 10 blue stickers, he used 31 red stickers on his fridge and 7 blue stickers on his laptop. How many red stickers does James have? +Thought: Let's think step by step. James initially bought 93 red stickers. He used 31 red stickers on his fridge. So, the number of red stickers left is 93 - 31 = 62. +Action: Finish[ +```python +original_red_stickers = 93 +used_red_stickers = 31 +red_stickers_left = original_red_stickers - used_red_stickers +answer = red_stickers_left +``` +] + +--- + +Question: Allen went to the supermarket to buy eggs. Each egg costs 80 dollars, and there is a discount of 29 dollars. How much does Allen have to pay for each egg? +Thought: Let's think step by step. The original cost of each egg is 80 dollars. There is a discount of 29 dollars. So, the amount Allen has to pay for each egg is 80 - 29 = 51 dollars. 
+Action: Finish[ +```python +original_egg_price_in_dollars = 80 +discount_dollars = 29 +price_after_discount = original_egg_price_in_dollars - discount_dollars +answer = price_after_discount +``` +] + +--- + +Question: Dianna collects both cases and books. She bought 22 cases and 5 books from the store. Now she has 57 cases and 25 books. How many books did Dianna have at first? +Thought: Let's think step by step. Dianna bought 5 books from the store, and now she has 25 books. So, the number of books she had at first is 25 - 5 = 20. +Action: Finish[ +```python +num_books_bought_at_store = 5 +num_books_now = 25 +initial_books = num_books_now - num_books_bought_at_store +answer = initial_books +``` +] + +--- + +Question: There were 108 chickens and 20 sheep at the farm. Some of the chickens and sheep were sold. Now there are 87 chickens and 18 sheep left. How many chickens were sold? +Thought: Let's think step by step. Initially, there were 108 chickens. Now there are 87 chickens. So, the number of chickens sold is 108 - 87 = 21. +Action: Finish[ +```python +num_chickens_before = 108 +num_chickens_now = 87 +chickens_sold = num_chickens_before - num_chickens_now +answer = chickens_sold +``` +] + +--- + +Question: Katty scored 2 goals on Monday, 8 goals on Tuesday, and 9 goals on Wednesday. How many goals did Katty score on Monday and Wednesday? +Thought: Let's think step by step. Katty scored 2 goals on Monday and 9 goals on Wednesday. So, the total number of goals scored on Monday and Wednesday is 2 + 9 = 11. +Action: Finish[ +```python +num_goals_on_monday = 2 +num_goals_on_wednesday = 9 +total_goals = num_goals_on_monday + num_goals_on_wednesday +answer = total_goals +``` +] + +--- + +Question: There are 5 girls and 4 boys in the Masquerade. 12 more girls and 7 more boys joined. How many more girls than boys are in the Masquerade? +Thought: Let's think step by step. Initially, there were 5 girls and 4 boys. Then 12 more girls and 7 more boys joined. 
The total number of girls is 5 + 12 = 17, and the total number of boys is 4 + 7 = 11. So, the difference is 17 - 11 = 6. +Action: Finish[ +```python +num_girls_before = 5 +num_girls_joined = 12 +num_boys_before = 4 +num_boys_joined = 7 +total_girls = num_girls_before + num_girls_joined +total_boys = num_boys_before + num_boys_joined +difference = total_girls - total_boys +answer = difference +``` +] + +--- + +Question: Joseph and Getty went to buy ice creams. Together, they bought 36 ice creams. On the way back, Joseph ate 12 of the ice creams, and he has 2 ice creams left now. How many ice creams did Getty purchase? +Thought: Let's think step by step. Joseph ate 12 ice creams and has 2 left, so he initially had 12 + 2 = 14 ice creams. Together, they bought 36 ice creams, so the number of ice creams Getty purchased is 36 - 14 = 22. +Action: Finish[ +```python +num_ice_creams_bought_by_joseph = 2 + 12 +total_ice_creams = 36 +ice_creams_bought_by_getty = total_ice_creams - num_ice_creams_bought_by_joseph +answer = ice_creams_bought_by_getty +``` +]""" + + SVAMP_FEWSHOT_EXAMPLES_REACT = """Question: James bought 93 red and 10 blue stickers, he used 31 red stickers on his fridge and 7 blue stickers on his laptop. How many red stickers does James have? Thought 1: First, I need to find out how many red stickers James has left after using some on his fridge. Action 1: Calculate[ diff --git a/agential/cog/prompts/benchmark/tabmwp.py b/agential/cog/prompts/benchmark/tabmwp.py index 2345cb265..ba3b89157 100644 --- a/agential/cog/prompts/benchmark/tabmwp.py +++ b/agential/cog/prompts/benchmark/tabmwp.py @@ -77,6 +77,101 @@ answer = "no\"""" +TABMWP_FEWSHOT_EXAMPLES_COT = """Read the following table regarding "Coin collections" and then write Python code to answer a question: + +Name | Number of coins +Braden | 76 +Camilla | 94 +Rick | 86 +Mary | 84 +Hector | 80 +Devin | 83 +Emily | 82 +Avery | 87 + +Question: Some friends discussed the sizes of their coin collections. 
What is the mean of the numbers? +Thought: Let's think step by step. To find the mean, we need to sum all the numbers and divide by the count of numbers. +Action: Finish[ +```python +number_of_coins_for_different_person = [76, 94, 86, 84, 80, 83, 82, 87] +answer = sum(number_of_coins_for_different_person) / len(number_of_coins_for_different_person) +``` +] + +--- + +Read the following table regarding "Price and Quantity" and then write Python code to answer a question: + +Price | Quantity demanded | Quantity supplied +$155 | 22,600 | 5,800 +$275 | 20,500 | 9,400 +$395 | 18,400 | 13,000 +$515 | 16,300 | 16,600 +$635 | 14,200 | 20,200 + +Question: At a price of $155, is there a shortage or a surplus? Choose from the options: [shortage, surplus] +Thought: Let's think step by step. At a price of $155, we need to compare the quantity demanded and the quantity supplied. +Action: Finish[ +```python +quantity_demanded_price_155 = 22600 +quantity_supplied_price_155 = 5800 +if quantity_demanded_price_155 > quantity_supplied_price_155: + answer = 'shortage' +else: + answer = 'surplus' +``` +] + +--- + +Read the following table regarding "Cans of food collected" and then write Python code to answer a question: + +Name | Number of cans +Samir | 7 +Kristen | 4 +Dakota | 7 +Jamie | 8 +Maggie | 9 + +Question: Samir's class recorded how many cans of food each student collected for their canned food drive. What is the median of the numbers? +Thought: Let's think step by step. To find the median, we need to sort the numbers and find the middle value. 
+Action: Finish[ +```python +cans = [7, 4, 7, 8, 9] +cans = sorted(cans) +middle1 = (len(cans) - 1) // 2 +middle2 = len(cans) // 2 +answer = (cans[middle1] + cans[middle2]) / 2 +``` +] + +--- + +Read the following table regarding "Toy prices" and then write Python code to answer a question: + +Item | Price +toy boat | $5.54 +toy guitar | $8.23 +set of juggling balls | $5.01 +trivia game | $8.18 +jigsaw puzzle | $5.30 +toy dinosaur | $3.00 + +Question: Lorenzo has $13.50. Does he have enough to buy a toy guitar and a set of juggling balls? Choose from the options: ['yes', 'no'] +Thought: Let's think step by step. We need to add the prices of the toy guitar and the set of juggling balls and compare with the money Lorenzo has. +Action: Finish[ +```python +guitar_price = 8.23 +juggling_balls_price = 5.01 +total_money = 13.5 +if total_money >= guitar_price + juggling_balls_price: + answer = "yes" +else: + answer = "no" +``` +]""" + + TABMWP_FEWSHOT_EXAMPLES_REACT = """Read the following table regarding "Coin collections" and then write Python code to answer a question: Name | Number of coins diff --git a/agential/cog/prompts/manager.py b/agential/cog/prompts/manager.py index e1ddf2a05..28ef65e11 100644 --- a/agential/cog/prompts/manager.py +++ b/agential/cog/prompts/manager.py @@ -13,6 +13,7 @@ FEVER_FEWSHOT_EXAMPLES_REACT, ) from agential.cog.prompts.benchmark.gsm8k import ( + GSM8K_FEWSHOT_EXAMPLES_COT, GSM8K_FEWSHOT_EXAMPLES_POT, GSM8K_FEWSHOT_EXAMPLES_REACT, ) @@ -30,10 +31,12 @@ MBPP_FEWSHOT_EXAMPLES_REACT, ) from agential.cog.prompts.benchmark.svamp import ( + SVAMP_FEWSHOT_EXAMPLES_COT, SVAMP_FEWSHOT_EXAMPLES_POT, SVAMP_FEWSHOT_EXAMPLES_REACT, ) from agential.cog.prompts.benchmark.tabmwp import ( + TABMWP_FEWSHOT_EXAMPLES_COT, TABMWP_FEWSHOT_EXAMPLES_POT, TABMWP_FEWSHOT_EXAMPLES_REACT, ) @@ -109,14 +112,17 @@ class FewShotType: Benchmarks.math.GSM8K: { FewShotType.POT: GSM8K_FEWSHOT_EXAMPLES_POT, FewShotType.REACT: GSM8K_FEWSHOT_EXAMPLES_REACT, + FewShotType.COT: 
GSM8K_FEWSHOT_EXAMPLES_COT, }, Benchmarks.math.SVAMP: { FewShotType.POT: SVAMP_FEWSHOT_EXAMPLES_POT, FewShotType.REACT: SVAMP_FEWSHOT_EXAMPLES_REACT, + FewShotType.COT: SVAMP_FEWSHOT_EXAMPLES_COT, }, Benchmarks.math.TABMWP: { FewShotType.POT: TABMWP_FEWSHOT_EXAMPLES_POT, FewShotType.REACT: TABMWP_FEWSHOT_EXAMPLES_REACT, + FewShotType.COT: TABMWP_FEWSHOT_EXAMPLES_COT, }, }, Benchmarks.CODE: { @@ -142,9 +148,9 @@ def get_fewshot_examples(mode: Dict[str, str], fewshot_type: str) -> str: - triviaqa: Supports "cot", "direct", "react" - ambignq: Supports "cot", "direct", "react" - math: - - gsm8k: Supports "pot", "react" - - svamp: Supports "pot", "react" - - tabmwp: Supports "pot", "react" + - gsm8k: Supports "pot", "cot", "react" + - svamp: Supports "pot", "cot", "react" + - tabmwp: Supports "pot", "cot", "react" - code: - humaneval: Supports "pot", "react" - mbpp: Supports "pot", "react" diff --git a/agential/cog/strategies/critic/base.py b/agential/cog/strategies/critic/base.py index 7e764b53d..60d5d7f93 100644 --- a/agential/cog/strategies/critic/base.py +++ b/agential/cog/strategies/critic/base.py @@ -50,17 +50,17 @@ def generate_critique( @abstractmethod def create_output_dict( - self, answer: str, critique: str, external_tool_info: Dict[str, str] + self, answer: str, critique: str, external_tool_info: Dict[str, Any] ) -> Dict[str, Any]: """Creates a dictionary containing the answer and critique, along with any additional key updates. Args: answer (str): The original answer. critique (str): The generated critique. - external_tool_info (Dict[str, str]): Information from any external tools used during the critique. + external_tool_info (Dict[str, Any]): Information from any external tools used during the critique. Returns: - Dict[str, Any]: A dictionary containing the answer, critique, and additional key updates. + Dict[str, Any]: The output dictionary with the answer, critique, and external tool info. 
""" pass diff --git a/agential/cog/strategies/critic/code.py b/agential/cog/strategies/critic/code.py index e55bff293..a97513da8 100644 --- a/agential/cog/strategies/critic/code.py +++ b/agential/cog/strategies/critic/code.py @@ -95,7 +95,7 @@ def generate_critique( Returns: Tuple[str, Dict[str, Any]]: The generated critique and external tool information. """ - external_tool_info = {} + external_tool_info = {"execution_status": ""} if use_tool: if "tests" not in additional_keys: raise ValueError( @@ -113,7 +113,7 @@ def generate_critique( validate_overlapping_keys(additional_keys, external_tool_info) additional_keys = additional_keys.copy() - additional_keys.update(external_tool_info) + additional_keys.update(external_tool_info if use_tool else {}) new_critique = _prompt_critique( llm=self.llm, @@ -128,19 +128,23 @@ def generate_critique( return new_critique, external_tool_info def create_output_dict( - self, answer: str, critique: str, external_tool_info: Dict[str, str] - ) -> Dict[str, str]: + self, answer: str, critique: str, external_tool_info: Dict[str, Any] + ) -> Dict[str, Any]: """Creates an output dictionary containing the answer, critique, and external tool information. Args: answer (str): The generated answer. critique (str): The generated critique. - external_tool_info (Dict[str, str]): Information from external tool execution. + external_tool_info (Dict[str, Any]): Information from external tool execution. Returns: - Dict[str, str]: The output dictionary. + Dict[str, Any]: The output dictionary with the answer, critique, and external tool info. 
""" - output_dict = {"code": answer, "critique": critique, **external_tool_info} + output_dict = { + "answer": answer, + "critique": critique, + "external_tool_info": external_tool_info, + } return output_dict def update_answer_based_on_critique( diff --git a/agential/cog/strategies/critic/math.py b/agential/cog/strategies/critic/math.py index 02f25cbbb..93f8d0362 100644 --- a/agential/cog/strategies/critic/math.py +++ b/agential/cog/strategies/critic/math.py @@ -101,7 +101,8 @@ def generate_critique( Returns: Tuple[str, Dict[str, Any]]: The generated critique and external tool information. """ - external_tool_info = {} + external_tool_info = {"execution_status": "", "code_answer": ""} + if use_tool: code_answer, execution_status = safe_execute(answer) external_tool_info = { @@ -136,7 +137,7 @@ def generate_critique( validate_overlapping_keys(additional_keys, external_tool_info) additional_keys = additional_keys.copy() - additional_keys.update(external_tool_info) + additional_keys.update(external_tool_info if use_tool else {}) new_critique = _prompt_critique( llm=self.llm, @@ -151,19 +152,23 @@ def generate_critique( return new_critique, external_tool_info def create_output_dict( - self, answer: str, critique: str, external_tool_info: Dict[str, str] - ) -> Dict[str, str]: + self, answer: str, critique: str, external_tool_info: Dict[str, Any] + ) -> Dict[str, Any]: """Creates an output dictionary containing the answer, critique, and external tool information. Args: answer (str): The generated answer. critique (str): The generated critique. - external_tool_info (Dict[str, str]): Information from external tool execution. + external_tool_info (Dict[str, Any]): Information from external tool execution. Returns: - Dict[str, str]: The output dictionary. + Dict[str, Any]: The output dictionary with the answer, critique, and external tool info. 
""" - output_dict = {"code": answer, "critique": critique, **external_tool_info} + output_dict = { + "answer": answer, + "critique": critique, + "external_tool_info": external_tool_info, + } return output_dict def update_answer_based_on_critique( diff --git a/agential/cog/strategies/critic/qa.py b/agential/cog/strategies/critic/qa.py index 1dd17ce45..398f19cff 100644 --- a/agential/cog/strategies/critic/qa.py +++ b/agential/cog/strategies/critic/qa.py @@ -106,7 +106,8 @@ def generate_critique( Returns: Tuple[str, Dict[str, Any]]: The generated critique and any external tool information. """ - external_tool_info = {} + external_tool_info = {"search_query": "", "search_result": ""} + new_critique = _prompt_critique( llm=self.llm, question=question, @@ -161,7 +162,7 @@ def generate_critique( return new_critique, external_tool_info def create_output_dict( - self, answer: str, critique: str, external_tool_info: Dict[str, str] + self, answer: str, critique: str, external_tool_info: Dict[str, Any] ) -> Dict[str, Any]: """Creates a dictionary containing the answer and critique, along with any additional key updates. @@ -172,7 +173,7 @@ def create_output_dict( Args: answer (str): The original answer. critique (str): The generated critique. - external_tool_info (Dict[str, str]): Information from any external tools used during the critique. + external_tool_info (Dict[str, Any]): Information from any external tools used during the critique. Returns: Dict[str, Any]: A dictionary containing the answer, critique, and additional key updates. 
@@ -180,7 +181,7 @@ def create_output_dict( output_dict = { "answer": answer if not self._halt else critique, "critique": critique, - **external_tool_info, + "external_tool_info": external_tool_info, } return output_dict diff --git a/agential/cog/strategies/react/base.py b/agential/cog/strategies/react/base.py index d976f3765..8ede1cada 100644 --- a/agential/cog/strategies/react/base.py +++ b/agential/cog/strategies/react/base.py @@ -1,7 +1,7 @@ """Base ReAct Agent strategy class.""" from abc import abstractmethod -from typing import Dict, Tuple +from typing import Any, Dict, Tuple from langchain_core.language_models.chat_models import BaseChatModel @@ -41,7 +41,9 @@ def generate_action( pass @abstractmethod - def generate_observation(self, idx: int, action_type: str, query: str) -> str: + def generate_observation( + self, idx: int, action_type: str, query: str + ) -> Tuple[str, Dict[str, Any]]: """Generates an observation based on the action type and query. Args: @@ -50,13 +52,18 @@ def generate_observation(self, idx: int, action_type: str, query: str) -> str: query (str): The query for the action. Returns: - str: The generated observation. + Tuple[str, Dict[str, Any]]: The generated observation and external tool outputs. """ pass @abstractmethod def create_output_dict( - self, thought: str, action_type: str, query: str, obs: str + self, + thought: str, + action_type: str, + query: str, + obs: str, + external_tool_info: Dict[str, Any], ) -> Dict[str, str]: """Creates a dictionary of the output components. @@ -65,9 +72,10 @@ def create_output_dict( action_type (str): The type of action performed. query (str): The query for the action. obs (str): The generated observation. + external_tool_info (Dict[str, Any]): The external tool outputs. Returns: - Dict[str, str]: A dictionary containing the thought, action type, query, and observation. + Dict[str, Any]: A dictionary containing the thought, action type, query, observation, answer, and external tool output. 
""" pass diff --git a/agential/cog/strategies/react/code.py b/agential/cog/strategies/react/code.py index 13eb54d8d..41f01beba 100644 --- a/agential/cog/strategies/react/code.py +++ b/agential/cog/strategies/react/code.py @@ -56,7 +56,7 @@ def __init__( self, llm: BaseChatModel, max_steps: int = 6, - max_tokens: int = 3896, + max_tokens: int = 5000, enc: Encoding = tiktoken.encoding_for_model("gpt-3.5-turbo"), ) -> None: """Initialization.""" @@ -66,7 +66,7 @@ def __init__( self.enc = enc self._scratchpad = "" - self._current_answer = "" + self._answer = "" self._finished = False def generate( @@ -148,7 +148,9 @@ def generate_action( return action_type, query - def generate_observation(self, idx: int, action_type: str, query: str) -> str: + def generate_observation( + self, idx: int, action_type: str, query: str + ) -> Tuple[str, Dict[str, Any]]: """Generates an observation based on the action type and query. Args: @@ -157,30 +159,44 @@ def generate_observation(self, idx: int, action_type: str, query: str) -> str: query (str): The query for the action. Returns: - str: The generated observation. + Tuple[str, Dict[str, Any]]: The generated observation and external tool outputs. 
""" + external_tool_info = {"execution_status": ""} + self._scratchpad += f"\nObservation {idx}: " if action_type.lower() == "finish": - self._current_answer = query + _, execution_status = safe_execute(query) + external_tool_info["execution_status"] = execution_status + + self._answer = query self._finished = True - obs = f"\n```python\n{self._current_answer}\n```" + obs = f"\n```python\n{self._answer}\n```" elif action_type.lower() == "implement": _, execution_status = safe_execute(query) - self._current_answer = query - obs = f"\n```python\n{self._current_answer}\n```\nExecution Status: {execution_status}" + external_tool_info["execution_status"] = execution_status + + self._answer = query + obs = f"\n```python\n{self._answer}\n```\nExecution Status: {execution_status}" elif action_type.lower() == "test": - obs = f"{self._current_answer}\n\n{query}" + obs = f"{self._answer}\n\n{query}" _, execution_status = safe_execute(obs) + external_tool_info["execution_status"] = execution_status + obs = f"\n```python\n{obs}\n```\nExecution Status: {execution_status}" else: obs = "Invalid Action. Valid Actions are Implement[code] Test[code] and Finish[answer]." self._scratchpad += obs - return obs + return obs, external_tool_info def create_output_dict( - self, thought: str, action_type: str, query: str, obs: str - ) -> Dict[str, str]: + self, + thought: str, + action_type: str, + query: str, + obs: str, + external_tool_info: Dict[str, Any], + ) -> Dict[str, Any]: """Creates a dictionary of the output components. Args: @@ -188,16 +204,18 @@ def create_output_dict( action_type (str): The type of action performed. query (str): The query for the action. obs (str): The generated observation. + external_tool_info (Dict[str, Any]): The external tool outputs. Returns: - Dict[str, str]: A dictionary containing the thought, action type, query, observation, and answer. 
+ Dict[str, Any]: A dictionary containing the thought, action type, query, observation, answer, and external tool output. """ return { "thought": thought, "action_type": action_type, "query": query, "observation": obs, - "answer": self._current_answer, + "answer": self._answer, + "external_tool_info": external_tool_info, } def halting_condition( @@ -248,7 +266,7 @@ def reset(self, **kwargs: Any) -> None: Returns: None """ - self._current_answer = "" + self._answer = "" self._scratchpad = "" self._finished = False diff --git a/agential/cog/strategies/react/math.py b/agential/cog/strategies/react/math.py index f9a2ab98d..9ab0c150c 100644 --- a/agential/cog/strategies/react/math.py +++ b/agential/cog/strategies/react/math.py @@ -56,7 +56,7 @@ def __init__( self, llm: BaseChatModel, max_steps: int = 6, - max_tokens: int = 3896, + max_tokens: int = 5000, enc: Encoding = tiktoken.encoding_for_model("gpt-3.5-turbo"), ) -> None: """Initialization.""" @@ -66,7 +66,7 @@ def __init__( self.enc = enc self._scratchpad = "" - self._current_answer = "" + self._answer = "" self._finished = False def generate( @@ -148,7 +148,9 @@ def generate_action( return action_type, query - def generate_observation(self, idx: int, action_type: str, query: str) -> str: + def generate_observation( + self, idx: int, action_type: str, query: str + ) -> Tuple[str, Dict[str, Any]]: """Generates an observation based on the action type and query. Args: @@ -157,28 +159,42 @@ def generate_observation(self, idx: int, action_type: str, query: str) -> str: query (str): The query for the action. Returns: - str: The generated observation. + Tuple[str, Dict[str, Any]]: The generated observation and external tool outputs. 
""" + external_tool_info = {"execution_status": "", "code_answer": ""} + self._scratchpad += f"\nObservation {idx}: " if action_type.lower() == "finish": - self._current_answer = query + code_answer, execution_status = safe_execute(query) + external_tool_info["code_answer"] = code_answer[0] + external_tool_info["execution_status"] = execution_status + + self._answer = query self._finished = True - obs = f"\n```python\n{self._current_answer}\n```" + obs = f"\n```python\n{self._answer}\n```" elif action_type.lower() == "calculate": - answer, execution_status = safe_execute(query) - self._current_answer = query - obs = f"\n```python\n{self._current_answer}\n```\nExecution Status: {execution_status}\nOutput: answer = {answer[0]}" + code_answer, execution_status = safe_execute(query) + external_tool_info["code_answer"] = code_answer[0] + external_tool_info["execution_status"] = execution_status + + self._answer = query + obs = f"\n```python\n{self._answer}\n```\nExecution Status: {execution_status}\nOutput: answer = {code_answer[0]}" else: obs = ( "Invalid Action. Valid Actions are Calculate[code] and Finish[answer]." ) self._scratchpad += obs - return obs + return obs, external_tool_info def create_output_dict( - self, thought: str, action_type: str, query: str, obs: str - ) -> Dict[str, str]: + self, + thought: str, + action_type: str, + query: str, + obs: str, + external_tool_info: Dict[str, Any], + ) -> Dict[str, Any]: """Creates a dictionary of the output components. Args: @@ -186,16 +202,18 @@ def create_output_dict( action_type (str): The type of action performed. query (str): The query for the action. obs (str): The generated observation. + external_tool_info (Dict[str, Any]): The external tool outputs. Returns: - Dict[str, str]: A dictionary containing the thought, action type, query, observation, and answer. + Dict[str, Any]: A dictionary containing the thought, action type, query, observation, answer, and external tool output. 
""" return { "thought": thought, "action_type": action_type, "query": query, "observation": obs, - "answer": self._current_answer, + "answer": self._answer, + "external_tool_info": external_tool_info, } def halting_condition( @@ -246,7 +264,7 @@ def reset(self, **kwargs: Any) -> None: Returns: None """ - self._current_answer = "" + self._answer = "" self._scratchpad = "" self._finished = False diff --git a/agential/cog/strategies/react/qa.py b/agential/cog/strategies/react/qa.py index b031bd503..fcbba8084 100644 --- a/agential/cog/strategies/react/qa.py +++ b/agential/cog/strategies/react/qa.py @@ -54,7 +54,7 @@ def __init__( self, llm: BaseChatModel, max_steps: int = 6, - max_tokens: int = 3896, + max_tokens: int = 5000, docstore: DocstoreExplorer = DocstoreExplorer(Wikipedia()), enc: Encoding = tiktoken.encoding_for_model("gpt-3.5-turbo"), ) -> None: @@ -66,6 +66,7 @@ def __init__( self.enc = enc self._scratchpad = "" + self._answer = "" self._finished = False def generate( @@ -142,7 +143,9 @@ def generate_action( return action_type, query - def generate_observation(self, idx: int, action_type: str, query: str) -> str: + def generate_observation( + self, idx: int, action_type: str, query: str + ) -> Tuple[str, Dict[str, Any]]: """Generates an observation based on the action type and query. Args: @@ -151,8 +154,10 @@ def generate_observation(self, idx: int, action_type: str, query: str) -> str: query (str): The query for the action. Returns: - str: The generated observation. + Tuple[str, Dict[str, Any]]: The generated observation and external tool outputs. 
""" + external_tool_info = {"search_result": "", "lookup_result": ""} + self._scratchpad += f"\nObservation {idx}: " if action_type.lower() == "finish": self._answer = query @@ -160,23 +165,33 @@ def generate_observation(self, idx: int, action_type: str, query: str) -> str: obs = query elif action_type.lower() == "search": try: - obs = remove_newline(self.docstore.search(query)) + search_result = self.docstore.search(query) + external_tool_info["search_result"] = search_result + obs = remove_newline(search_result) except Exception: obs = "Could not find that page, please try again." elif action_type.lower() == "lookup": try: - obs = remove_newline(self.docstore.lookup(query)) + lookup_result = self.docstore.lookup(query) + external_tool_info["lookup_result"] = lookup_result + obs = remove_newline(lookup_result) + except ValueError: obs = "The last page Searched was not found, so you cannot Lookup a keyword in it. Please try one of the similar pages given." else: obs = "Invalid Action. Valid Actions are Lookup[] Search[] and Finish[]." self._scratchpad += obs - return obs + return obs, external_tool_info def create_output_dict( - self, thought: str, action_type: str, query: str, obs: str - ) -> Dict[str, str]: + self, + thought: str, + action_type: str, + query: str, + obs: str, + external_tool_info: Dict[str, Any], + ) -> Dict[str, Any]: """Creates a dictionary of the output components. Args: @@ -184,15 +199,18 @@ def create_output_dict( action_type (str): The type of action performed. query (str): The query for the action. obs (str): The generated observation. + external_tool_info (Dict[str, Any]): The external tool outputs. Returns: - Dict[str, str]: A dictionary containing the thought, action type, query, and observation. + Dict[str, Any]: A dictionary containing the thought, action type, query, observation, answer, and external tool output. 
""" return { "thought": thought, "action_type": action_type, "query": query, "observation": obs, + "answer": self._answer, + "external_tool_info": external_tool_info, } def halting_condition( diff --git a/agential/cog/strategies/reflexion/base.py b/agential/cog/strategies/reflexion/base.py index 0978d2c2c..19c06a9b1 100644 --- a/agential/cog/strategies/reflexion/base.py +++ b/agential/cog/strategies/reflexion/base.py @@ -54,7 +54,7 @@ def generate_observation( key (str): The key for the observation. Returns: - Tuple[bool, str]: The generated observation. + Tuple[bool, str]: A boolean indicating correctness and the generated observation. """ pass @@ -174,7 +174,7 @@ def generate_action( @abstractmethod def generate_observation( self, step_idx: int, action_type: str, query: str, key: str - ) -> Tuple[bool, str]: + ) -> Tuple[bool, str, Dict[str, Any]]: """Generates an observation based on the action type and query. Args: @@ -184,7 +184,8 @@ def generate_observation( key (str): The key for the observation. Returns: - str: The generated observation. + Tuple[bool, str, Dict[str, Any]]: A tuple containing a boolean indicating whether the answer is correct, a string representing the observation, + and a dictionary of the external tool outputs. """ pass @@ -205,8 +206,14 @@ def create_output_dict( @abstractmethod def react_create_output_dict( - self, thought: str, action_type: str, query: str, obs: str, is_correct: bool - ) -> Dict[str, str]: + self, + thought: str, + action_type: str, + query: str, + obs: str, + external_tool_info: Dict[str, Any], + is_correct: bool, + ) -> Dict[str, Any]: """Creates a dictionary of the output components. Args: @@ -214,10 +221,11 @@ def react_create_output_dict( action_type (str): The type of action performed. query (str): The query for the action. obs (str): The generated observation. + external_tool_info (Dict[str, Any]): The external tool outputs. is_correct (bool): Whether the observation is correct. 
Returns: - Dict[str, str]: A dictionary containing the thought, action type, observation, answer, and is_correct. + Dict[str, Any]: A dictionary containing the thought, action type, observation, answer, external_tool_info, and is_correct. """ pass diff --git a/agential/cog/strategies/reflexion/math.py b/agential/cog/strategies/reflexion/math.py new file mode 100644 index 000000000..22c03c128 --- /dev/null +++ b/agential/cog/strategies/reflexion/math.py @@ -0,0 +1,740 @@ +"""Reflexion Agent strategies for Math.""" + +import re + +from typing import Any, Dict, List, Optional, Tuple + +import tiktoken + +from langchain_core.language_models.chat_models import BaseChatModel +from tiktoken.core import Encoding + +from agential.cog.eval.reflexion import EM +from agential.cog.functional.reflexion import ( + _is_halted, + _prompt_cot_agent, + _prompt_react_agent, + _truncate_scratchpad, +) +from agential.cog.modules.reflect.reflexion import ( + ReflexionCoTReflector, + ReflexionReActReflector, +) +from agential.cog.strategies.reflexion.base import ( + ReflexionCoTBaseStrategy, + ReflexionReActBaseStrategy, +) +from agential.utils.general import safe_execute +from agential.utils.parse import remove_newline + + +def parse_math_action_cot(action: str) -> Tuple[str, str]: + """Parses an action string to extract the action type and code content. + + Identifies action types (`Finish`) and extracts the + corresponding code content enclosed within Markdown-style code blocks. + The action type is case-insensitive and the code content is trimmed of + leading and trailing whitespace. + + Args: + action (str): The action string containing the action type and code content. + + Returns: + Tuple[str, str]: A tuple containing the extracted action type (capitalized) + and the extracted code content. 
+ """ + action_split = action.split("```python", maxsplit=1) + match = re.search(r"\b(Finish)\b", action_split[0], re.IGNORECASE) + + action_type = match.group(0).lower().capitalize() if match else "" + try: + query = action_split[1].split("```")[0].strip() if action_type else "" + except: + action_type = "" + query = "" + + return action_type, query + + +def parse_math_action_react(action: str) -> Tuple[str, str]: + """Parses an action string to extract the action type and code content. + + Identifies action types (`Finish`, `Calculate`) and extracts the + corresponding code content enclosed within Markdown-style code blocks. + The action type is case-insensitive and the code content is trimmed of + leading and trailing whitespace. + + Args: + action (str): The action string containing the action type and code content. + + Returns: + Tuple[str, str]: A tuple containing the extracted action type (capitalized) + and the extracted code content. + """ + action_split = action.split("```python", maxsplit=1) + match = re.search(r"\b(Finish|Calculate)\b", action_split[0], re.IGNORECASE) + + action_type = match.group(0).lower().capitalize() if match else "" + try: + query = action_split[1].split("```")[0].strip() if action_type else "" + except: + action_type = "" + query = "" + + return action_type, query + + +class ReflexionCoTMathStrategy(ReflexionCoTBaseStrategy): + """A strategy class for Math benchmarks using the ReflexionCoT agent. + + Attributes: + llm (BaseChatModel): The language model used for generating answers and critiques. + reflector (Optional[ReflexionCoTReflector]): The reflector used for generating reflections. Defaults to None. + max_reflections (int): The maximum number of reflections allowed. Defaults to 3. + max_trials (int): The maximum number of trials allowed. Defaults to 1. 
class ReflexionCoTMathStrategy(ReflexionCoTBaseStrategy):
    """A strategy class for Math benchmarks using the ReflexionCoT agent.

    Attributes:
        llm (BaseChatModel): The language model used for generating answers and critiques.
        reflector (Optional[ReflexionCoTReflector]): The reflector used for generating reflections. Defaults to None.
        max_reflections (int): The maximum number of reflections allowed. Defaults to 3.
        max_trials (int): The maximum number of trials allowed. Defaults to 1.
    """

    def __init__(
        self,
        llm: BaseChatModel,
        reflector: Optional[ReflexionCoTReflector] = None,
        max_reflections: int = 3,
        max_trials: int = 1,
    ) -> None:
        """Initialization."""
        super().__init__(llm)
        self.llm = llm
        self.max_reflections = max_reflections
        self.max_trials = max_trials

        if not reflector:
            reflector = ReflexionCoTReflector(llm=llm, max_reflections=max_reflections)
        self.reflector = reflector

        # Internal per-run state: the running scratchpad transcript, whether a
        # Finish action has been taken, and the latest answer (a code string).
        self._scratchpad = ""
        self._finished = False
        self._answer = ""

    def generate(
        self,
        question: str,
        examples: str,
        reflections: str,
        prompt: str,
        additional_keys: Dict[str, str],
        **kwargs: Any,
    ) -> str:
        """Generates a thought based on the question, examples, and prompt.

        Args:
            question (str): The question to be answered.
            examples (str): Examples to guide the generation process.
            reflections (str): Reflections to consider during generation.
            prompt (str): The prompt used for generating the thought.
            additional_keys (Dict[str, str]): Additional keys for the generation process.
            **kwargs (Any): Additional arguments.

        Returns:
            str: The generated thought.
        """
        self._scratchpad += "\nThought:"
        thought = _prompt_cot_agent(
            llm=self.llm,
            examples=examples,
            reflections=reflections,
            question=question,
            scratchpad=self._scratchpad,
            prompt=prompt,
            additional_keys=additional_keys,
        )
        # Keep only the first line of the thought and drop any "Action" text
        # the model may have eagerly appended.
        thought = remove_newline(thought).split("Action")[0].strip().split("\n")[0]
        self._scratchpad += " " + thought

        return thought

    def generate_action(
        self,
        question: str,
        examples: str,
        reflections: str,
        prompt: str,
        additional_keys: Dict[str, str],
        **kwargs: Any,
    ) -> Tuple[str, str]:
        """Generates an action based on the question, examples, and prompt.

        Args:
            question (str): The question to be answered.
            examples (str): Examples to guide the generation process.
            reflections (str): Reflections to consider during generation.
            prompt (str): The prompt used for generating the action.
            additional_keys (Dict[str, str]): Additional keys for the generation process.
            **kwargs (Any): Additional arguments.

        Returns:
            Tuple[str, str]: The generated action type and query.
        """
        self._scratchpad += "\nAction:"
        action = _prompt_cot_agent(
            llm=self.llm,
            examples=examples,
            reflections=reflections,
            question=question,
            scratchpad=self._scratchpad,
            prompt=prompt,
            additional_keys=additional_keys,
        )
        # Strip any "Observation" text the model may have eagerly appended.
        action = action.split("Observation")[0].strip()

        action_type, query = parse_math_action_cot(action)
        self._scratchpad += f" {action_type}[\n```python\n{query}\n```\n]"

        return action_type, query

    def generate_observation(
        self, action_type: str, query: str, key: str
    ) -> Tuple[bool, str]:
        """Generates an observation based on the action type and query.

        Args:
            action_type (str): The type of action to be performed.
            query (str): The query for the action.
            key (str): The key for the observation.

        Returns:
            Tuple[bool, str]: A boolean indicating correctness and the generated observation.
        """
        self._scratchpad += f"\nObservation: "
        if action_type.lower() == "finish":
            self._finished = True
            self._answer = query
            # BUGFIX: execute the newly proposed code AFTER assigning it, so
            # the correctness check reflects this trial's answer. Previously
            # the stale `self._answer` (empty on the first trial) was executed
            # before assignment, making the observation always INCORRECT.
            answer, _ = safe_execute(self._answer)
            if EM(answer[0], key, normalize=False):
                obs = "Answer is CORRECT"
            else:
                obs = "Answer is INCORRECT"
        else:
            answer, _ = safe_execute(self._answer)
            obs = "Invalid action type, please try again."
        self._scratchpad += obs

        return EM(answer[0], key, normalize=False), obs

    def create_output_dict(
        self,
        thought: str,
        action_type: str,
        obs: str,
        is_correct: bool,
        reflections: List[str],
    ) -> Dict[str, Any]:
        """Creates a dictionary of the output components.

        Args:
            thought (str): The generated thought.
            action_type (str): The type of action performed.
            obs (str): The generated observation.
            is_correct (bool): Whether the answer is correct.
            reflections (List[str]): The reflections.

        Returns:
            Dict[str, Any]: A dictionary containing the thought, action type, observation, answer, is_correct, and reflections.
        """
        return {
            "thought": thought,
            "action_type": action_type,
            "observation": obs,
            "answer": self._answer,
            "is_correct": is_correct,
            "reflections": reflections,
        }

    def halting_condition(self, idx: int, key: str, **kwargs: Any) -> bool:
        """Determines whether the halting condition has been met.

        Args:
            idx (int): The current step index.
            key (str): The key for the observation.
            **kwargs (Any): Additional arguments.

        Returns:
            bool: True if the halting condition is met, False otherwise.
        """
        max_trials = kwargs.get("max_trials", self.max_trials)
        answer, _ = safe_execute(self._answer)
        return EM(answer[0], key, normalize=False) or idx >= max_trials

    def reset(self, **kwargs: Any) -> None:
        """Resets the internal state of the strategy.

        Resets the scratchpad and the finished flag.
        Resets only the scratchpad if specified with 'only_scratchpad'.

        Args:
            **kwargs (Any): Additional arguments.
        """
        only_scratchpad = kwargs.get("only_scratchpad", False)
        if only_scratchpad:
            self._scratchpad = ""
        else:
            self.reflector.reset()
            self._scratchpad = ""
            self._finished = False
            self._answer = ""

    def reflect(
        self,
        reflect_strategy: str,
        question: str,
        examples: str,
        prompt: str,
        additional_keys: Dict[str, str],
    ) -> Tuple[List[str], str]:
        """Reflects on a given question, context, examples, prompt, and additional keys using the specified reflection strategy.

        Args:
            reflect_strategy (str): The strategy to use for reflection.
            question (str): The question to be reflected upon.
            examples (str): Examples to guide the reflection process.
            prompt (str): The prompt or instruction to guide the reflection.
            additional_keys (Dict[str, str]): Additional keys for the reflection process.

        Returns:
            Tuple[List[str], str]: The reflections and the reflection string.
        """
        reflections, reflections_str = self.reflector.reflect(
            reflect_strategy=reflect_strategy,
            question=question,
            examples=examples,
            scratchpad=self._scratchpad,
            prompt=prompt,
            additional_keys=additional_keys,
        )
        return reflections, reflections_str

    def reflect_condition(
        self, idx: int, reflect_strategy: Optional[str], key: str
    ) -> bool:
        """Determines whether the reflection condition has been met.

        Args:
            idx (int): The current step.
            reflect_strategy (Optional[str]): The strategy to use for reflection.
            key (str): The key for the observation.

        Returns:
            bool: True if the reflection condition is met, False otherwise.
        """
        answer, _ = safe_execute(self._answer)
        return (
            idx > 0
            and not EM(answer[0], key, normalize=False)
            and reflect_strategy is not None
        )
class ReflexionReActMathStrategy(ReflexionReActBaseStrategy):
    """A strategy class for Math benchmarks using the ReflexionReAct agent.

    Attributes:
        llm (BaseChatModel): The language model used for generating answers and critiques.
        reflector (Optional[ReflexionReActReflector]): The reflector used for generating reflections. Defaults to None.
        max_reflections (int): The maximum number of reflections allowed. Defaults to 3.
        max_trials (int): The maximum number of trials allowed. Defaults to 1.
        max_steps (int): The maximum number of steps allowed. Defaults to 6.
        max_tokens (int): The maximum number of tokens allowed. Defaults to 5000.
        enc (Encoding): The encoding for tokenization. Defaults to gpt-3.5-turbo.
    """

    def __init__(
        self,
        llm: BaseChatModel,
        reflector: Optional[ReflexionReActReflector] = None,
        max_reflections: int = 3,
        max_trials: int = 1,
        max_steps: int = 6,
        max_tokens: int = 5000,
        enc: Encoding = tiktoken.encoding_for_model("gpt-3.5-turbo"),
    ) -> None:
        """Initialization."""
        super().__init__(llm)
        self.max_reflections = max_reflections
        self.max_trials = max_trials
        self.max_steps = max_steps
        self.max_tokens = max_tokens
        self.enc = enc

        if not reflector:
            reflector = ReflexionReActReflector(
                llm=llm, max_reflections=max_reflections
            )
        self.reflector = reflector

        # Internal per-run state: whether a Finish action has been taken, the
        # latest answer (a code string), and the running scratchpad transcript.
        self._finished = False
        self._answer = ""
        self._scratchpad = ""

    def generate(
        self,
        question: str,
        examples: str,
        reflections: str,
        prompt: str,
        additional_keys: Dict[str, str],
        **kwargs: Any,
    ) -> str:
        """Generates a thought based on the given question, examples, reflections, prompt, and additional keys.

        Args:
            question (str): The question to generate a thought for.
            examples (str): Examples to guide the thought generation process.
            reflections (str): Reflections to consider during the thought generation process.
            prompt (str): The prompt or instruction to guide the thought generation.
            additional_keys (Dict[str, str]): Additional keys for the thought generation process.
            kwargs (Dict[str, Any]): Additional keyword arguments.

        Returns:
            str: The generated thought.
        """
        max_steps = kwargs.get("max_steps", self.max_steps)  # type: ignore

        self._scratchpad += "\nThought:"
        thought = _prompt_react_agent(
            llm=self.llm,
            question=question,
            examples=examples,
            reflections=reflections,
            scratchpad=self._scratchpad,
            max_steps=max_steps,  # type: ignore
            prompt=prompt,
            additional_keys=additional_keys,
        )
        # Drop any "Action" text the model may have eagerly appended.
        thought = remove_newline(thought).split("Action")[0]
        self._scratchpad += " " + thought

        return thought

    def generate_action(
        self,
        question: str,
        examples: str,
        reflections: str,
        prompt: str,
        additional_keys: Dict[str, str],
        **kwargs: Any,
    ) -> Tuple[str, str]:
        """Generates an action based on the given question, examples, reflections, prompt, and additional keys.

        Args:
            question (str): The question to generate an action for.
            examples (str): Examples to guide the action generation process.
            reflections (str): Reflections to consider during the action generation process.
            prompt (str): The prompt or instruction to guide the action generation.
            additional_keys (Dict[str, str]): Additional keys for the action generation process.
            kwargs (Dict[str, Any]): Additional keyword arguments.

        Returns:
            Tuple[str, str]: The generated action type and query.
        """
        max_steps = kwargs.get("max_steps", self.max_steps)
        self._scratchpad += "\nAction:"
        action = _prompt_react_agent(
            llm=self.llm,
            question=question,
            examples=examples,
            reflections=reflections,
            scratchpad=self._scratchpad,
            max_steps=max_steps,  # type: ignore
            prompt=prompt,
            additional_keys=additional_keys,
        )
        # Strip any "Observation" text the model may have eagerly appended.
        action = action.split("Observation")[0].strip()

        action_type, query = parse_math_action_react(action)
        self._scratchpad += f" {action_type}[\n```python\n{query}\n```\n]"

        return action_type, query

    def generate_observation(
        self, step_idx: int, action_type: str, query: str, key: str
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """Generate an observation based on the action type and query.

        Args:
            step_idx (int): The index of the current step.
            action_type (str): The type of action to be performed.
            query (str): The query for the action.
            key (str): The key for the observation.

        Returns:
            Tuple[bool, str, Dict[str, Any]]: A tuple containing a boolean indicating whether the answer is correct, a string representing the observation,
                and a dictionary of the external tool outputs.
        """
        external_tool_info = {"execution_status": "", "code_answer": ""}
        # Execute the generated code up front; both Finish and Calculate need
        # its result, and the final EM check uses it regardless of branch.
        code_answer, execution_status = safe_execute(query)

        self._scratchpad += f"\nObservation {step_idx}: "
        if action_type.lower() == "finish":
            external_tool_info["code_answer"] = code_answer[0]
            external_tool_info["execution_status"] = execution_status

            self._answer = query
            self._finished = True

            if EM(code_answer[0], key, normalize=False):
                obs = "Answer is CORRECT"
            else:
                obs = "Answer is INCORRECT"
        elif action_type.lower() == "calculate":
            external_tool_info["code_answer"] = code_answer[0]
            external_tool_info["execution_status"] = execution_status

            self._answer = query
            obs = f"\n```python\n{self._answer}\n```\nExecution Status: {execution_status}\nOutput: answer = {code_answer[0]}"
        else:
            obs = (
                "Invalid Action. Valid Actions are Calculate[code] and Finish[answer]."
            )
        self._scratchpad += obs

        return EM(code_answer[0], key, normalize=False), obs, external_tool_info

    def create_output_dict(
        self, react_out: List[Dict[str, Any]], reflections: List[str]
    ) -> Dict[str, Any]:
        """Create a dictionary containing the output of the ReflexionReAct agent.

        Args:
            react_out (List[Dict[str, Any]]): The output of the ReflexionReAct agent, containing the thought, action type, query, observation, and whether the answer is correct for each step.
            reflections (List[str]): The reflections generated by the ReflexionReAct agent.

        Returns:
            Dict[str, Any]: A dictionary containing the 'react_output' and 'reflections'.
        """
        return {
            "react_output": react_out,
            "reflections": reflections,
        }

    def react_create_output_dict(
        self,
        thought: str,
        action_type: str,
        query: str,
        obs: str,
        external_tool_info: Dict[str, Any],
        is_correct: bool,
    ) -> Dict[str, Any]:
        """Create a dictionary containing the output of a single step in the ReflexionReAct agent.

        Args:
            thought (str): The thought generated in the current step.
            action_type (str): The type of action performed in the current step.
            query (str): The query or information related to the action performed in the current step.
            obs (str): The observation generated in the current step.
            external_tool_info (Dict[str, Any]): The external tool outputs.
            is_correct (bool): A boolean indicating whether the answer generated in the current step is correct.

        Returns:
            Dict[str, Any]: A dictionary containing the 'thought', 'action_type', 'query', 'observation', 'answer', 'external_tool_info', and 'is_correct' of the current step.
        """
        return {
            "thought": thought,
            "action_type": action_type,
            "query": query,
            "observation": obs,
            "answer": self._answer,
            "external_tool_info": external_tool_info,
            "is_correct": is_correct,
        }

    def halting_condition(self, idx: int, key: str, **kwargs: Any) -> bool:
        """Determine whether the halting condition has been met.

        Args:
            idx (int): The current step index.
            key (str): The key for the observation.
            kwargs (Dict[str, Any]): Additional keyword arguments.

        Returns:
            bool: True if the halting condition is met, False otherwise. The halting condition is met when the answer is not correct and the current step index is less than the maximum number of trials plus one.
        """
        max_trials: int = kwargs.get("max_trials", self.max_trials)
        code_answer, _ = safe_execute(self._answer)

        return not EM(code_answer[0], key, normalize=False) and idx < max_trials + 1

    def react_halting_condition(
        self,
        step_idx: int,
        question: str,
        examples: str,
        reflections: str,
        prompt: str,
        additional_keys: Dict[str, str],
        **kwargs: Any,
    ) -> bool:
        """Determine whether the halting condition has been met in the ReflexionReAct agent.

        Args:
            step_idx (int): The index of the current step.
            question (str): The question to generate an action for.
            examples (str): Examples to guide the action generation process.
            reflections (str): Reflections to consider during the action generation process.
            prompt (str): The prompt or instruction to guide the action generation.
            additional_keys (Dict[str, str]): Additional keys for the action generation process.
            kwargs (Dict[str, Any]): Additional keyword arguments.

        Returns:
            bool: True if the halting condition is met, False otherwise. The halting condition is met when the agent has finished, exceeded the step budget, or exceeded the token budget.
        """
        max_steps = kwargs.get("max_steps", self.max_steps)

        return _is_halted(
            finished=self._finished,
            step_idx=step_idx,
            question=question,
            scratchpad=self._scratchpad,
            examples=examples,
            reflections=reflections,
            max_steps=max_steps,
            max_tokens=self.max_tokens,
            enc=self.enc,
            prompt=prompt,
            additional_keys=additional_keys,
        )

    def reset(self, **kwargs: Any) -> None:
        """Resets the internal state of the strategy.

        Resets the scratchpad and the finished flag.
        Keeps the reflector's state if specified with 'no_reflector'.

        Args:
            **kwargs (Any): Additional keyword arguments.
        """
        no_reflector = kwargs.get("no_reflector", False)
        if not no_reflector:
            self.reflector.reset()
        self._scratchpad = ""
        self._finished = False
        self._answer = ""

    def reflect(
        self,
        reflect_strategy: str,
        question: str,
        examples: str,
        prompt: str,
        additional_keys: Dict[str, str],
    ) -> Tuple[List[str], str]:
        """Reflects on a given question, context, examples, prompt, and additional keys using the specified reflection strategy.

        Args:
            reflect_strategy (str): The strategy to use for reflection.
            question (str): The question to be reflected upon.
            examples (str): Examples to guide the reflection process.
            prompt (str): The prompt or instruction to guide the reflection.
            additional_keys (Dict[str, str]): Additional keys for the reflection process.

        Returns:
            Tuple[List[str], str]: The reflections and reflection string.
        """
        reflections, reflections_str = self.reflector.reflect(
            reflect_strategy=reflect_strategy,
            question=question,
            examples=examples,
            # Truncate the scratchpad so the reflection prompt stays within
            # the token budget.
            scratchpad=_truncate_scratchpad(
                scratchpad=self._scratchpad, tokenizer=self.enc
            ),
            prompt=prompt,
            additional_keys=additional_keys,
        )

        return reflections, reflections_str

    def reflect_condition(
        self,
        step_idx: int,
        reflect_strategy: Optional[str],
        question: str,
        examples: str,
        key: str,
        prompt: str,
        additional_keys: Dict[str, str],
        **kwargs: Any,
    ) -> bool:
        """Determine whether the reflection condition has been met in the ReflexionReAct agent.

        Args:
            step_idx (int): The index of the current step.
            reflect_strategy (Optional[str]): The strategy to use for reflection.
            question (str): The question to be reflected upon.
            examples (str): Examples to guide the reflection process.
            key (str): The key for the observation.
            prompt (str): The prompt or instruction to guide the reflection.
            additional_keys (Dict[str, str]): Additional keys for the reflection process.
            kwargs (Any): Additional keyword arguments.

        Returns:
            bool: True if the reflection condition is met, False otherwise. The reflection condition is met when the agent is halted, the answer is not correct, and the reflection strategy is provided.
        """
        max_steps = kwargs.get("max_steps", self.max_steps)

        halted = _is_halted(
            finished=self._finished,
            step_idx=step_idx,
            question=question,
            scratchpad=self._scratchpad,
            examples=examples,
            reflections=self.reflector.reflections_str,
            max_steps=max_steps,
            max_tokens=self.max_tokens,
            enc=self.enc,
            prompt=prompt,
            additional_keys=additional_keys,
        )

        code_answer, _ = safe_execute(self._answer)

        return (
            halted
            and not EM(code_answer[0], key, normalize=False)
            and reflect_strategy is not None
        )


class ReflexionCoTGSM8KStrategy(ReflexionCoTMathStrategy):
    """A strategy class for the GSM8K benchmark using the ReflexionCoT agent."""

    pass


class ReflexionCoTSVAMPStrategy(ReflexionCoTMathStrategy):
    """A strategy class for the SVAMP benchmark using the ReflexionCoT agent."""

    pass


class ReflexionCoTTabMWPStrategy(ReflexionCoTMathStrategy):
    """A strategy class for the TabMWP benchmark using the ReflexionCoT agent."""

    pass


class ReflexionReActGSM8KStrategy(ReflexionReActMathStrategy):
    """A strategy class for the GSM8K benchmark using the ReflexionReAct agent."""

    pass


class ReflexionReActSVAMPStrategy(ReflexionReActMathStrategy):
    """A strategy class for the SVAMP benchmark using the ReflexionReAct agent."""

    pass


class ReflexionReActTabMWPStrategy(ReflexionReActMathStrategy):
    """A strategy class for the TabMWP benchmark using the ReflexionReAct agent."""

    pass
generate_observation( key (str): The key for the observation. Returns: - Tuple[bool, str]: The generated observation. + Tuple[bool, str]: A boolean indicating correctness and the generated observation. """ self._scratchpad += f"\nObservation: " if action_type.lower() == "finish": @@ -203,12 +203,12 @@ def create_output_dict( reflections (List[str]): The reflections. Returns: - Dict[str, Any]: A dictionary containing the thought, action type, observation, answer, and is_correct. + Dict[str, Any]: A dictionary containing the thought, action type, observation, answer, is_correct, and reflections. """ return { "thought": thought, "action_type": action_type, - "obs": obs, + "observation": obs, "answer": self._answer, "is_correct": is_correct, "reflections": reflections, @@ -241,9 +241,6 @@ def reset(self, **kwargs: Any) -> None: Args: **kwargs (Any): Additional arguments. - - Returns: - None """ only_scratchpad = kwargs.get("only_scratchpad", False) if only_scratchpad: @@ -312,7 +309,7 @@ class ReflexionReActQAStrategy(ReflexionReActBaseStrategy): max_reflections (int): The maximum number of reflections allowed. Defaults to 3. max_trials (int): The maximum number of trials allowed. Defaults to 1. max_steps (int): The maximum number of steps allowed. Defaults to 6. - max_tokens (int): The maximum number of tokens allowed. Defaults to 3896. + max_tokens (int): The maximum number of tokens allowed. Defaults to 5000. docstore (DocstoreExplorer): The document store explorer for retrieving relevant documents. Defaults to Wikipedia. enc (Encoding): The encoding for tokenization. Defaults to gpt-3.5-turbo. 
""" @@ -324,7 +321,7 @@ def __init__( max_reflections: int = 3, max_trials: int = 1, max_steps: int = 6, - max_tokens: int = 3896, + max_tokens: int = 5000, docstore: DocstoreExplorer = DocstoreExplorer(Wikipedia()), enc: Encoding = tiktoken.encoding_for_model("gpt-3.5-turbo"), ) -> None: @@ -434,7 +431,7 @@ def generate_observation( action_type: str, query: str, key: str, - ) -> Tuple[bool, str]: + ) -> Tuple[bool, str, Dict[str, Any]]: """Generate an observation based on the action type and query. Args: @@ -444,8 +441,11 @@ def generate_observation( key (str): The key for the observation. Returns: - Tuple[bool, str]: A tuple containing a boolean indicating whether the answer is correct, and a string representing the observation. + Tuple[bool, str, Dict[str, Any]]: A tuple containing a boolean indicating whether the answer is correct, a string representing the observation, + and a dictionary of the external tool outputs. """ + external_tool_info = {"search_result": "", "lookup_result": ""} + self._scratchpad += f"\nObservation {step_idx}: " if action_type.lower() == "finish": self._answer = query @@ -456,19 +456,23 @@ def generate_observation( obs = "Answer is INCORRECT" elif action_type.lower() == "search": try: - obs = remove_newline(self.docstore.search(query)) + search_result = self.docstore.search(query) + external_tool_info["search_result"] = search_result + obs = remove_newline(search_result) except Exception: obs = "Could not find that page, please try again." elif action_type.lower() == "lookup": try: - obs = remove_newline(self.docstore.lookup(query)) + lookup_result = self.docstore.lookup(query) + external_tool_info["lookup_result"] = lookup_result + obs = remove_newline(lookup_result) except ValueError: obs = "The last page Searched was not found, so you cannot Lookup a keyword in it. Please try one of the similar pages given." else: obs = "Invalid Action. Valid Actions are Lookup[] Search[] and Finish[]." 
self._scratchpad += obs - return EM(self._answer, key), obs + return EM(self._answer, key), obs, external_tool_info def create_output_dict( self, @@ -490,7 +494,13 @@ def create_output_dict( } def react_create_output_dict( - self, thought: str, action_type: str, query: str, obs: str, is_correct: bool + self, + thought: str, + action_type: str, + query: str, + obs: str, + external_tool_info: Dict[str, Any], + is_correct: bool, ) -> Dict[str, Any]: """Create a dictionary containing the output of a single step in the ReflexionReAct agent. @@ -499,16 +509,19 @@ def react_create_output_dict( action_type (str): The type of action performed in the current step. query (str): The query or information related to the action performed in the current step. obs (str): The observation generated in the current step. + external_tool_info (Dict[str, Any]): The external tool outputs. is_correct (bool): A boolean indicating whether the answer generated in the current step is correct. Returns: - Dict[str, Any]: A dictionary containing the 'thought', 'action_type', 'query', 'observation', and 'is_correct' of the current step. + Dict[str, Any]: A dictionary containing the 'thought', 'action_type', 'query', 'observation', 'answer', 'external_tool_info', and 'is_correct' of the current step. 
""" return { "thought": thought, "action_type": action_type, "query": query, "observation": obs, + "answer": self._answer, + "external_tool_info": external_tool_info, "is_correct": is_correct, } diff --git a/agential/cog/strategies/strategy_factory.py b/agential/cog/strategies/strategy_factory.py index 34bf5b6a5..6f0cf3a36 100644 --- a/agential/cog/strategies/strategy_factory.py +++ b/agential/cog/strategies/strategy_factory.py @@ -35,6 +35,14 @@ ReflexionCoTBaseStrategy, ReflexionReActBaseStrategy, ) +from agential.cog.strategies.reflexion.math import ( + ReflexionCoTGSM8KStrategy, + ReflexionCoTSVAMPStrategy, + ReflexionCoTTabMWPStrategy, + ReflexionReActGSM8KStrategy, + ReflexionReActSVAMPStrategy, + ReflexionReActTabMWPStrategy, +) from agential.cog.strategies.reflexion.qa import ( ReflexionCoTAmbigNQStrategy, ReflexionCoTFEVERStrategy, @@ -210,11 +218,11 @@ def get_strategy( raise ValueError(f"Unsupported QA benchmark: {mode['qa']}") elif "math" in mode: if mode["math"] == "gsm8k": - pass + return ReflexionCoTGSM8KStrategy(**strategy_kwargs) elif mode["math"] == "svamp": - pass + return ReflexionCoTSVAMPStrategy(**strategy_kwargs) elif mode["math"] == "tabmwp": - pass + return ReflexionCoTTabMWPStrategy(**strategy_kwargs) else: raise ValueError(f"Unsupported Math benchmark: {mode['math']}") elif "code" in mode: @@ -273,11 +281,11 @@ def get_strategy( raise ValueError(f"Unsupported QA benchmark: {mode['qa']}") elif "math" in mode: if mode["math"] == "gsm8k": - pass + return ReflexionReActGSM8KStrategy(**strategy_kwargs) elif mode["math"] == "svamp": - pass + return ReflexionReActSVAMPStrategy(**strategy_kwargs) elif mode["math"] == "tabmwp": - pass + return ReflexionReActTabMWPStrategy(**strategy_kwargs) else: raise ValueError(f"Unsupported Math benchmark: {mode['math']}") elif "code" in mode: diff --git a/docs/docs/index.md b/docs/docs/index.md index 3ef34d45c..73061960d 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -38,7 +38,7 @@ | **Methods / 
Benchmarks** | GSM8k | SVAMP | TabMWP | | ------------------------ | :------------------: | :------------------: | :------------------: | | ReAct | :octicons-check-16: | :octicons-check-16: | :octicons-check-16: | -| Reflexion | | | | +| Reflexion | :octicons-check-16: | :octicons-check-16: | :octicons-check-16: | | CRITIC | :material-check-all: | :material-check-all: | :material-check-all: | | LATS | | | | diff --git a/notebooks/react.ipynb b/notebooks/react.ipynb index 9b9f4c623..9b235a876 100644 --- a/notebooks/react.ipynb +++ b/notebooks/react.ipynb @@ -88,7 +88,7 @@ " llm=llm,\n", " mode={\"qa\": \"hotpotqa\"},\n", " max_steps=8,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -132,7 +132,7 @@ " llm=llm,\n", " mode={\"qa\": \"fever\"},\n", " max_steps=8,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -176,7 +176,7 @@ " llm=llm,\n", " mode={\"qa\": \"ambignq\"},\n", " max_steps=8,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -220,7 +220,7 @@ " llm=llm,\n", " mode={\"qa\": \"triviaqa\"},\n", " max_steps=8,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -387,7 +387,7 @@ " llm=llm,\n", " mode={\"code\": \"humaneval\"},\n", " max_steps=6,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", "\n", @@ -433,7 +433,7 @@ " llm=llm,\n", " mode={\"code\": \"mbpp\"},\n", " max_steps=6,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", "\n", diff --git 
a/notebooks/reflexion.ipynb b/notebooks/reflexion.ipynb index 8fa4005bd..ed40222d7 100644 --- a/notebooks/reflexion.ipynb +++ b/notebooks/reflexion.ipynb @@ -34,7 +34,28 @@ " REFLEXION_COT_INSTRUCTION_TRIVIAQA,\n", " REFLEXION_REACT_INSTRUCTION_TRIVIAQA,\n", " REFLEXION_COT_REFLECT_INSTRUCTION_TRIVIAQA,\n", - " REFLEXION_REACT_REFLECT_INSTRUCTION_TRIVIAQA\n", + " REFLEXION_REACT_REFLECT_INSTRUCTION_TRIVIAQA,\n", + "\n", + " GSM8K_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT,\n", + " GSM8K_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT,\n", + " REFLEXION_COT_INSTRUCTION_GSM8K,\n", + " REFLEXION_REACT_INSTRUCTION_GSM8K,\n", + " REFLEXION_COT_REFLECT_INSTRUCTION_GSM8K,\n", + " REFLEXION_REACT_REFLECT_INSTRUCTION_GSM8K,\n", + "\n", + " SVAMP_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT,\n", + " SVAMP_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT,\n", + " REFLEXION_COT_INSTRUCTION_SVAMP,\n", + " REFLEXION_REACT_INSTRUCTION_SVAMP,\n", + " REFLEXION_COT_REFLECT_INSTRUCTION_SVAMP,\n", + " REFLEXION_REACT_REFLECT_INSTRUCTION_SVAMP,\n", + "\n", + " TABMWP_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT,\n", + " TABMWP_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT,\n", + " REFLEXION_COT_INSTRUCTION_TABMWP,\n", + " REFLEXION_REACT_INSTRUCTION_TABMWP,\n", + " REFLEXION_COT_REFLECT_INSTRUCTION_TABMWP,\n", + " REFLEXION_REACT_REFLECT_INSTRUCTION_TABMWP\n", ")\n", "from agential.cog.prompts.benchmark.hotpotqa import (\n", " HOTPOTQA_FEWSHOT_EXAMPLES_COT,\n", @@ -52,8 +73,22 @@ " AMBIGNQ_FEWSHOT_EXAMPLES_COT,\n", " AMBIGNQ_FEWSHOT_EXAMPLES_REACT,\n", ")\n", - "\n", - "\n", + "from agential.cog.prompts.benchmark.gsm8k import (\n", + " GSM8K_FEWSHOT_EXAMPLES_COT,\n", + " GSM8K_FEWSHOT_EXAMPLES_REACT\n", + ")\n", + "from agential.cog.prompts.benchmark.svamp import (\n", + " SVAMP_FEWSHOT_EXAMPLES_COT,\n", + " SVAMP_FEWSHOT_EXAMPLES_REACT\n", + ")\n", + "from agential.cog.prompts.benchmark.tabmwp import (\n", + " TABMWP_FEWSHOT_EXAMPLES_COT,\n", + " TABMWP_FEWSHOT_EXAMPLES_REACT\n", + ")\n", + "from 
agential.cog.modules.reflect.reflexion import (\n", + " ReflexionCoTReflector,\n", + " ReflexionReActReflector,\n", + ")\n", "import tiktoken\n", "from langchain_community.docstore.wikipedia import Wikipedia\n", "from agential.utils.docstore import DocstoreExplorer\n", @@ -97,6 +132,7 @@ "agent = ReflexionCoTAgent(\n", " llm=llm,\n", " mode={\"qa\": \"hotpotqa\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", " max_reflections=3,\n", " max_trials=1,\n", ")\n", @@ -138,8 +174,9 @@ "agent = ReflexionReActAgent(\n", " llm=llm,\n", " mode={\"qa\": \"hotpotqa\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", " max_steps=6,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -189,6 +226,7 @@ "agent = ReflexionCoTAgent(\n", " llm=llm,\n", " mode={\"qa\": \"fever\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", " max_reflections=3,\n", " max_trials=1,\n", ")\n", @@ -230,8 +268,11 @@ "agent = ReflexionReActAgent(\n", " llm=llm,\n", " mode={\"qa\": \"fever\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1,\n", " max_steps=6,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -280,6 +321,7 @@ "agent = ReflexionCoTAgent(\n", " llm=llm,\n", " mode={\"qa\": \"ambignq\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", " max_reflections=3,\n", " max_trials=1,\n", ")\n", @@ -320,8 +362,11 @@ "agent = ReflexionReActAgent(\n", " llm=llm,\n", " mode={\"qa\": \"ambignq\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1,\n", " max_steps=6,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -370,6 +415,7 
@@ "agent = ReflexionCoTAgent(\n", " llm=llm,\n", " mode={\"qa\": \"triviaqa\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", " max_reflections=3,\n", " max_trials=1,\n", ")\n", @@ -410,8 +456,11 @@ "agent = ReflexionReActAgent(\n", " llm=llm,\n", " mode={\"qa\": \"triviaqa\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1,\n", " max_steps=6,\n", - " max_tokens=3896,\n", + " max_tokens=5000,\n", " docstore=DocstoreExplorer(Wikipedia()),\n", " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", ")\n", @@ -455,6 +504,93 @@ "### GSM8K" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\"\n", + "key = -9867630\n", + "\n", + "agent = ReflexionCoTAgent(\n", + " llm=llm, \n", + " mode={\"math\": \"gsm8k\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1\n", + ")\n", + "\n", + "out = agent.generate(\n", + " question=question,\n", + " key=key,\n", + " examples=GSM8K_FEWSHOT_EXAMPLES_COT,\n", + " prompt=REFLEXION_COT_INSTRUCTION_GSM8K,\n", + " reflect_examples=GSM8K_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT,\n", + " reflect_prompt=REFLEXION_COT_REFLECT_INSTRUCTION_GSM8K,\n", + " reflect_strategy=\"reflexion\",\n", + " additional_keys={},\n", + " reflect_additional_keys={},\n", + " max_trials=3,\n", + " patience=3,\n", + " reset=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"Janet's ducks lay 16 eggs 
per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\"\n", + "key = -9867630\n", + "\n", + "agent = ReflexionReActAgent(\n", + " llm=llm,\n", + " mode={\"math\": \"gsm8k\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1,\n", + " max_steps=6,\n", + " max_tokens=5000,\n", + " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", + ")\n", + "out = agent.generate(\n", + " question=question, \n", + " key=key, \n", + " examples=GSM8K_FEWSHOT_EXAMPLES_REACT,\n", + " prompt=REFLEXION_REACT_INSTRUCTION_GSM8K, \n", + " reflect_examples=GSM8K_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT,\n", + " reflect_prompt=REFLEXION_REACT_REFLECT_INSTRUCTION_GSM8K,\n", + " reflect_strategy=\"reflexion\",\n", + " additional_keys={},\n", + " reflect_additional_keys={},\n", + " max_steps=6,\n", + " max_trials=3,\n", + " patience=3,\n", + " reset=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -462,6 +598,92 @@ "### SVAMP" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups. 
How big is each group of bananas?\"\n", + "key = 145\n", + "\n", + "agent = ReflexionCoTAgent(\n", + " llm=llm, \n", + " mode={\"math\": \"svamp\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1\n", + ")\n", + "out = agent.generate(\n", + " question=question,\n", + " key=key,\n", + " examples=SVAMP_FEWSHOT_EXAMPLES_COT,\n", + " prompt=REFLEXION_COT_INSTRUCTION_SVAMP,\n", + " reflect_examples=SVAMP_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT,\n", + " reflect_prompt=REFLEXION_COT_REFLECT_INSTRUCTION_SVAMP,\n", + " reflect_strategy=\"reflexion\",\n", + " additional_keys={},\n", + " reflect_additional_keys={},\n", + " max_trials=3,\n", + " patience=3,\n", + " reset=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups. 
How big is each group of bananas?\"\n", + "key = 145\n", + "\n", + "agent = ReflexionReActAgent(\n", + " llm=llm,\n", + " mode={\"math\": \"svamp\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1,\n", + " max_steps=6,\n", + " max_tokens=5000,\n", + " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", + ")\n", + "out = agent.generate(\n", + " question=question, \n", + " key=key, \n", + " examples=SVAMP_FEWSHOT_EXAMPLES_REACT,\n", + " prompt=REFLEXION_REACT_INSTRUCTION_SVAMP, \n", + " reflect_examples=SVAMP_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT,\n", + " reflect_prompt=REFLEXION_REACT_REFLECT_INSTRUCTION_SVAMP,\n", + " reflect_strategy=\"reflexion\",\n", + " additional_keys={},\n", + " reflect_additional_keys={},\n", + " max_steps=6,\n", + " max_trials=3,\n", + " patience=3,\n", + " reset=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -470,30 +692,127 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "# Code" + "question = \"\"\"Read the following table regarding \"Bowling Scores\" and then write Python code to answer a question:\n", + "\n", + "Name | Score\n", + "Amanda | 117\n", + "Sam | 236\n", + "Irma | 144\n", + "Mike | 164\n", + "\n", + "Question: Some friends went bowling and kept track of their scores. 
How many more points did Mike score than Irma?\"\"\"\n", + "key = 20\n", + "\n", + "agent = ReflexionCoTAgent(\n", + " llm=llm, \n", + " mode={\"math\": \"tabmwp\"},\n", + " reflector=ReflexionCoTReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1\n", + ")\n", + "out = agent.generate(\n", + " question=question,\n", + " key=key,\n", + " examples=TABMWP_FEWSHOT_EXAMPLES_COT,\n", + " prompt=REFLEXION_COT_INSTRUCTION_TABMWP,\n", + " reflect_examples=TABMWP_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT,\n", + " reflect_prompt=REFLEXION_COT_REFLECT_INSTRUCTION_TABMWP,\n", + " reflect_strategy=\"reflexion\",\n", + " additional_keys={},\n", + " reflect_additional_keys={},\n", + " max_trials=3,\n", + " patience=3,\n", + " reset=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"\"\"Read the following table regarding \"Bowling Scores\" and then write Python code to answer a question:\n", + "\n", + "Name | Score\n", + "Amanda | 117\n", + "Sam | 236\n", + "Irma | 144\n", + "Mike | 164\n", + "\n", + "Question: Some friends went bowling and kept track of their scores. 
How many more points did Mike score than Irma?\"\"\"\n", + "key = 20\n", + "\n", + "agent = ReflexionReActAgent(\n", + " llm=llm,\n", + " mode={\"math\": \"tabmwp\"},\n", + " reflector=ReflexionReActReflector(llm=llm),\n", + " max_reflections=3,\n", + " max_trials=1,\n", + " max_steps=6,\n", + " max_tokens=5000,\n", + " enc=tiktoken.encoding_for_model(\"gpt-3.5-turbo\"),\n", + ")\n", + "out = agent.generate(\n", + " question=question, \n", + " key=key, \n", + " examples=TABMWP_FEWSHOT_EXAMPLES_REACT,\n", + " prompt=REFLEXION_REACT_INSTRUCTION_TABMWP, \n", + " reflect_examples=TABMWP_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT,\n", + " reflect_prompt=REFLEXION_REACT_REFLECT_INSTRUCTION_TABMWP,\n", + " reflect_strategy=\"reflexion\",\n", + " additional_keys={},\n", + " reflect_additional_keys={},\n", + " max_steps=6,\n", + " max_trials=3,\n", + " patience=3,\n", + " reset=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### HumanEval" + "# Code" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### MBPP" + "### HumanEval" ] }, { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "### MBPP" + ] } ], "metadata": { diff --git a/tests/cog/agent/test_critic.py b/tests/cog/agent/test_critic.py index 49abca947..321bfff41 100644 --- a/tests/cog/agent/test_critic.py +++ b/tests/cog/agent/test_critic.py @@ -67,6 +67,13 @@ def test_generate() -> None: # Test "qa" mode with search tool. 
search = MagicMock(spec=GoogleSerperAPIWrapper) + search.results.return_value = [ + { + "title": "agential-ai/agential: The encyclopedia of LLM-based agents - GitHub", + "link": "https://github.com/alckasoc/agential", + "snippet": '\'Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts"\xa0...', + } + ] responses = [ "Let's break it down step by step. The kickboxer who fits this description is Badr Hari. So the answer is: Badr Hari.", 'The question asks for a kickboxer who fits the description provided, and the answer "Badr Hari" is a plausible response.\n\n2. Truthfulness:\n\nLet\'s search the question in Google:\n\n> Search Query: Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring\n> Evidence: [Badr Hari - Wikipedia] Badr Hari is a Moroccan-Dutch super heavyweight kickboxer from the Netherlands, fighting out of Mike\'s Gym in Oostzaan. He is a former K-1 Heavyweight Champion (2007-2008) and It\'s Showtime Heavyweight Champion (2009-2010).\n\nThe evidence confirms that Badr Hari fits the description provided in the question.\n\nOverall, the proposed answer is both plausible and truthful.\n\nQuestion: Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring?\nHere\'s the most possible answer: Let\'s break it down step by step. The kickboxer who fits this description is Badr Hari. 
So the answer is: Badr Hari.', diff --git a/tests/cog/agent/test_expel.py b/tests/cog/agent/test_expel.py index f2bd3feb6..130ebb6d3 100644 --- a/tests/cog/agent/test_expel.py +++ b/tests/cog/agent/test_expel.py @@ -144,21 +144,36 @@ def test_gather_experience(hotpotqa_distractor_sample_path: str) -> None: "thought": "I need to search for the giant silverware company that was started as a religious Utopian group and was run by Pierrepont Noyes.", "action_type": "Search", "query": "giant silverware company started as religious Utopian group Pierrepont Noyes", - "observation": "Could not find [giant silverware company started as religious Utopian group Pierrepont Noyes]. Similar: []", + "observation": "Search result", + "answer": "", + "external_tool_info": { + "search_result": "Search result", + "lookup_result": "", + }, "is_correct": False, }, { "thought": "The search query was too specific. I should try searching for the silverware company and then look for information about its history and founder.", "action_type": "Search", "query": "silverware company Pierrepont Noyes", - "observation": "Pierrepont Burt Noyes (August 18, 1870 – April 15, 1959) was an American businessman and writer. He was brought up in the Oneida Community, a religious Utopian group. Noyes later became the head of Oneida Limited, a position he held for many years.", + "observation": "Search result", + "answer": "", + "external_tool_info": { + "search_result": "Search result", + "lookup_result": "", + }, "is_correct": False, }, { "thought": "Pierrepont Noyes was the head of Oneida Limited, a silverware company. I need to confirm if Oneida Limited was indeed started as a religious Utopian group.", "action_type": "Search", "query": "Oneida Limited religious Utopian group", - "observation": "The Oneida Community ( oh-NYE-də) was a perfectionist religious communal society founded by John Humphrey Noyes and his followers in 1848 near Oneida, New York. 
The community believed that Jesus had already returned in AD 70, making it possible for them to bring about Jesus's millennial kingdom themselves, and be perfect and free of sin in this world, not just in Heaven (a belief called perfectionism). The Oneida Community practiced communalism (in the sense of communal property and possessions), group marriage, male sexual continence, Oneida stirpiculture (a form of eugenics), and mutual criticism.The community's original 87 members grew to 172 by February 1850, 208 by 1852, and 306 by 1878. There were smaller Noyesian communities in Wallingford, Connecticut; Newark, New Jersey; Putney and Cambridge, Vermont. The branches were closed in 1854 except for the Wallingford branch, which operated until the 1878 tornado devastated it.The Oneida Community dissolved in 1881, converting itself to a joint-stock company. This eventually became the silverware company Oneida Limited, one of the largest in the world.", + "observation": "Search result", + "answer": "", + "external_tool_info": { + "search_result": "Search result", + "lookup_result": "", + }, "is_correct": False, }, { @@ -166,6 +181,11 @@ def test_gather_experience(hotpotqa_distractor_sample_path: str) -> None: "action_type": "Finish", "query": "Oneida Limited", "observation": "Answer is CORRECT", + "answer": "Oneida Limited", + "external_tool_info": { + "search_result": "", + "lookup_result": "", + }, "is_correct": True, }, ], @@ -188,9 +208,12 @@ def test_gather_experience(hotpotqa_distractor_sample_path: str) -> None: ] llm = FakeListChatModel(responses=action_responses) agent = ExpeLAgent(llm=llm, mode={"qa": "hotpotqa"}) + agent.reflexion_react_agent.strategy.docstore.search = lambda x: "Search result" + agent.reflexion_react_agent.strategy.docstore.lookup = lambda x: "Lookup result" new_experiences = agent.gather_experience( questions=hotpotqa.question.values[-1:], keys=hotpotqa.answer.values[-1:] ) + assert new_experiences == gt_new_experiences assert 
new_experiences == agent.experience_memory.experiences assert len(agent.experience_memory.success_traj_docs) == 13 @@ -297,21 +320,36 @@ def test_generate(expel_experiences_10_fake_path: str) -> None: "thought": "I need to find out the name of the giant silverware company that was started as a religious Utopian group and was run by Pierrepont Noyes for many years. I should search for the company's name and its history to gather more information. ", "action_type": "Search", "query": "giant silverware company religious Utopian group Pierrepont Noyes", - "observation": "Pierrepont Burt Noyes (August 18, 1870 – April 15, 1959) was an American businessman and writer. He was brought up in the Oneida Community, a religious Utopian group. Noyes later became the head of Oneida Limited, a position he held for many years.", + "observation": "Search result", + "answer": "", + "external_tool_info": { + "search_result": "Search result", + "lookup_result": "", + }, "is_correct": False, }, { "thought": "I need to search for more information about Oneida Limited to see if it matches the description of a giant silverware company that was started as a religious Utopian group and was run by Pierrepont Noyes. I should look for details about the company's history and any relevant connections to Noyes. ", "action_type": "Search", "query": "Oneida Limited giant silverware company religious Utopian group Pierrepont Noyes", - "observation": "Pierrepont Burt Noyes (August 18, 1870 – April 15, 1959) was an American businessman and writer. He was brought up in the Oneida Community, a religious Utopian group. Noyes later became the head of Oneida Limited, a position he held for many years.", + "observation": "Search result", + "answer": "", + "external_tool_info": { + "search_result": "Search result", + "lookup_result": "", + }, "is_correct": False, }, { "thought": 'It seems like the search results are not providing specific information about Oneida Limited as a giant silverware company. 
I should try to look up the keyword "silverware" in the last passage successfully found and see if it provides any relevant details about Oneida Limited\'s products.', "action_type": "Lookup", "query": "silverware", - "observation": '(Result 1/1) == Oneida Limited ==After studying at Colgate University, followed by Harvard University, P.B. Noyes joined Oneida Limited, the company which emerged from the commune after his father\'s death. He went on to become president of the company, steering it towards specialising in silverware and stainless steel cutlery. In 1894, he married another stirpicult, Corinna Ackley Kinsley (Also his half-niece), and the couple had three children.As the head of Oneida Limited, Noyes developed the company\'s ideology. He believed that "good wages were essential to good morale," and in 1904 proposed a policy of voluntary salary reductions for management whenever the company was in financial difficulties. The company followed this during economic troubles in 1921. Historian Maren Lockwood Carden wrote that, "Noyes halved his own salary, the directors took a one-third reduction, and the other officials took smaller ones in proportion to their regular salaries."Noyes also encouraged the development of Sherrill, New York as a community for employees. In 1905 the company laid out plans for the town, giving bonuses to those employees who built their own homes there. 
The company also helped to fund athletic clubs, a golf course, and the building of a new elementary school and a new high school.', + "observation": "Lookup result", + "answer": "", + "external_tool_info": { + "search_result": "", + "lookup_result": "Lookup result", + }, "is_correct": False, }, { @@ -319,6 +357,11 @@ def test_generate(expel_experiences_10_fake_path: str) -> None: "action_type": "Finish", "query": "Oneida Limited", "observation": "Answer is CORRECT", + "answer": "Oneida Limited", + "external_tool_info": { + "search_result": "", + "lookup_result": "", + }, "is_correct": True, }, ], @@ -363,7 +406,8 @@ def test_generate(expel_experiences_10_fake_path: str) -> None: mode={"qa": "hotpotqa"}, experience_memory=ExpeLExperienceMemory(experiences), ) - + agent.reflexion_react_agent.strategy.docstore.search = lambda x: "Search result" + agent.reflexion_react_agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate(question=question, key=key) assert out == gt_out assert len(agent.experience_memory.experiences["idxs"]) == 6 diff --git a/tests/cog/agent/test_react.py b/tests/cog/agent/test_react.py index a1c3fefbb..f2744e9b0 100644 --- a/tests/cog/agent/test_react.py +++ b/tests/cog/agent/test_react.py @@ -40,6 +40,9 @@ def test_generate() -> None: ] llm = FakeListChatModel(responses=responses) agent = ReActAgent(llm=llm, mode={"qa": "hotpotqa"}) + agent.strategy.docstore.search = ( + lambda x: "Buakaw Banchamek has faced several controversies and legal issues." 
+ ) out = agent.generate( question=question, diff --git a/tests/cog/agent/test_reflexion.py b/tests/cog/agent/test_reflexion.py index 65520d946..3d7f5602e 100644 --- a/tests/cog/agent/test_reflexion.py +++ b/tests/cog/agent/test_reflexion.py @@ -291,6 +291,8 @@ def test_reflexion_react_generate() -> None: mode={"qa": "hotpotqa"}, max_trials=1, ) + agent.strategy.docstore.search = lambda x: "Search result" + agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate( question=question, key=key, @@ -333,6 +335,8 @@ def test_reflexion_react_generate() -> None: ] llm = FakeListChatModel(responses=responses) agent = ReflexionReActAgent(llm=llm, mode={"qa": "hotpotqa"}, max_trials=1) + agent.strategy.docstore.search = lambda x: "Search result" + agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate( question=question, key=key, @@ -403,6 +407,8 @@ def test_reflexion_react_generate() -> None: mode={"qa": "hotpotqa"}, max_trials=2, ) + agent.strategy.docstore.search = lambda x: "Search result" + agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate( question=question, key=key, @@ -446,6 +452,8 @@ def test_reflexion_react_generate() -> None: ] llm = FakeListChatModel(responses=responses) agent = ReflexionReActAgent(llm=llm, mode={"qa": "hotpotqa"}) + agent.strategy.docstore.search = lambda x: "Search result" + agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate( question=question, key=key, @@ -480,6 +488,8 @@ def test_reflexion_react_generate() -> None: ] llm = FakeListChatModel(responses=responses) agent = ReflexionReActAgent(llm=llm, mode={"qa": "hotpotqa"}) + agent.strategy.docstore.search = lambda x: "Search result" + agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate( question=question, key=key, @@ -501,6 +511,8 @@ def test_reflexion_react_generate() -> None: # In a subsequent run, answer correctly (reset defaults to True). 
Output is non-empty if patience is correctly reset. agent = ReflexionReActAgent(llm=llm, mode={"qa": "hotpotqa"}) + agent.strategy.docstore.search = lambda x: "Search result" + agent.strategy.docstore.lookup = lambda x: "Lookup result" out = agent.generate( question=question, key=key, diff --git a/tests/cog/eval/test_reflexion_eval.py b/tests/cog/eval/test_reflexion_eval.py index 28c1a2fde..837d4dd91 100644 --- a/tests/cog/eval/test_reflexion_eval.py +++ b/tests/cog/eval/test_reflexion_eval.py @@ -5,6 +5,11 @@ def test_em() -> None: """Test EM function.""" + sample_answer = None + sample_key = None + result = EM(sample_answer, sample_key) + assert not result + sample_answer = ( "A fox jumped over the fence. An apple was on the table. The quick brown fox." ) @@ -22,3 +27,13 @@ def test_em() -> None: ) result = EM(sample_answer, sample_key) assert not result + + sample_answer = " A fox jumped over the fence. " + sample_key = "A fox jumped over the fence." + result = EM(sample_answer, sample_key, normalize=False) + assert not result + + sample_answer = "A fox jumped over the fence." + sample_key = "A fox jumped over the fence." + result = EM(sample_answer, sample_key, normalize=False) + assert result diff --git a/tests/cog/strategies/critic/test_code.py b/tests/cog/strategies/critic/test_code.py index cd17851e6..a854404fc 100644 --- a/tests/cog/strategies/critic/test_code.py +++ b/tests/cog/strategies/critic/test_code.py @@ -90,7 +90,7 @@ def test_generate_critique() -> None: ) assert critique == gt_critique - assert external_tool_info == {} + assert external_tool_info == {"execution_status": ""} # Test no tests error. 
with pytest.raises(ValueError): @@ -138,7 +138,7 @@ def test_create_output_dict() -> None: result = strategy.create_output_dict( answer="", critique="", external_tool_info={"a": "b"} ) - assert result == {"code": "", "critique": "", "a": "b"} + assert result == {"answer": "", "critique": "", "external_tool_info": {"a": "b"}} def test_update_answer_based_on_critique() -> None: diff --git a/tests/cog/strategies/critic/test_math.py b/tests/cog/strategies/critic/test_math.py index 4ee767496..9c3c8acce 100644 --- a/tests/cog/strategies/critic/test_math.py +++ b/tests/cog/strategies/critic/test_math.py @@ -74,7 +74,7 @@ def test_generate_critique() -> None: ) assert result == gt_result - assert external_tool_info == {} + assert external_tool_info == {"execution_status": "", "code_answer": ""} # Test with tool. gt_result = "1. The revenue from selling eggs should be a positive number, -9867630 < 0, which is not reasonable.\n\n2. Let's check the code:\n\n- `total_eggs = 16` - This defines the total number of eggs laid by Janet's ducks per day.\n- `eaten_eggs = 3` - This represents the number of eggs Janet eats for breakfast.\n- `baked_eggs = 4933828` - This represents the number of eggs Janet uses to bake muffins for her friends daily.\n- `sold_eggs = total_eggs - eaten_eggs - baked_eggs` - This calculates the number of eggs Janet has left to sell at the farmers' market.\n- `dollars_per_egg = 2` - This represents the selling price of each fresh duck egg.\n- `answer = sold_eggs * dollars_per_egg` - This calculates the total revenue from selling eggs at the farmers' market.\n\nThe issue with the code is that the calculation for `sold_eggs` is incorrect. Janet should only sell the eggs that are left after she eats some for breakfast and uses some for baking. 
\n\n" @@ -152,10 +152,15 @@ def test_create_output_dict() -> None: result = strategy.create_output_dict(answer, critique, external_tool_info) - assert result["code"] == answer + assert result["answer"] == answer assert result["critique"] == critique - assert result["execution_status"] == external_tool_info["execution_status"] - assert result["code_answer"] == external_tool_info["code_answer"] + assert ( + result["external_tool_info"]["execution_status"] + == external_tool_info["execution_status"] + ) + assert ( + result["external_tool_info"]["code_answer"] == external_tool_info["code_answer"] + ) def test_update_answer_based_on_critique() -> None: diff --git a/tests/cog/strategies/critic/test_qa.py b/tests/cog/strategies/critic/test_qa.py index c138dfe5a..0f07b4f7d 100644 --- a/tests/cog/strategies/critic/test_qa.py +++ b/tests/cog/strategies/critic/test_qa.py @@ -169,7 +169,7 @@ def test_generate_critique() -> None: ) assert result == gt_result - assert external_tool_info == {} + assert external_tool_info == {"search_query": "", "search_result": ""} assert strategy._query_history == [] assert strategy._evidence_history == set() assert strategy._halt @@ -188,19 +188,19 @@ def test_create_output_dict() -> None: assert result["answer"] == "The capital of France is Paris." assert result["critique"] == "The answer is correct." - assert result["search_query"] == "capital of France" - assert result["search_result"] == "Paris" + assert result["external_tool_info"]["search_query"] == "capital of France" + assert result["external_tool_info"]["search_result"] == "Paris" strategy._halt = True result = strategy.create_output_dict(answer, critique, external_tool_info) assert "answer" in result assert "critique" in result - assert "search_query" in result - assert "search_result" in result + assert "search_query" in result["external_tool_info"] + assert "search_result" in result["external_tool_info"] assert result["answer"] == "The answer is correct." 
assert result["critique"] == "The answer is correct." - assert result["search_query"] == "capital of France" - assert result["search_result"] == "Paris" + assert result["external_tool_info"]["search_query"] == "capital of France" + assert result["external_tool_info"]["search_result"] == "Paris" def test_update_answer_based_on_critique() -> None: diff --git a/tests/cog/strategies/react/test_code.py b/tests/cog/strategies/react/test_code.py index fc3cc7604..8472f60f8 100644 --- a/tests/cog/strategies/react/test_code.py +++ b/tests/cog/strategies/react/test_code.py @@ -54,9 +54,9 @@ def test_init() -> None: strategy = ReActCodeStrategy(llm=llm) assert isinstance(strategy.llm, BaseChatModel) assert strategy.max_steps == 6 - assert strategy.max_tokens == 3896 + assert strategy.max_tokens == 5000 assert isinstance(strategy.enc, Encoding) - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._scratchpad == "" assert strategy._finished == False @@ -115,11 +115,14 @@ def test_generate_observation() -> None: query = "def first_repeated_char(s):\n char_set = set()\n for char in s:\n if char in char_set:\n return char\n else:\n char_set.add(char)\n return None" llm = FakeListChatModel(responses=[]) strategy = ReActCodeStrategy(llm=llm) - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert obs == gt_obs - assert strategy._current_answer == query + assert strategy._answer == query assert strategy._finished is False assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": "Done"} # Test test. 
gt_obs = "\n```python\nprint('Hello World')\n\ndef first_repeated_char(s):\n char_set = set()\n for char in s:\n if char in char_set:\n return char\n else:\n char_set.add(char)\n return None\n```\nExecution Status: Done" @@ -128,12 +131,15 @@ def test_generate_observation() -> None: query = "def first_repeated_char(s):\n char_set = set()\n for char in s:\n if char in char_set:\n return char\n else:\n char_set.add(char)\n return None" llm = FakeListChatModel(responses=[]) strategy = ReActCodeStrategy(llm=llm) - strategy._current_answer = "print('Hello World')" - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + strategy._answer = "print('Hello World')" + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert obs == gt_obs - assert strategy._current_answer == "print('Hello World')" + assert strategy._answer == "print('Hello World')" assert strategy._finished is False assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": "Done"} # Test finish. 
gt_obs = "\n```python\ndef first_repeated_char(s):\n char_set = set()\n for char in s:\n if char in char_set:\n return char\n else:\n char_set.add(char)\n return None\n```" @@ -142,11 +148,14 @@ def test_generate_observation() -> None: query = "def first_repeated_char(s):\n char_set = set()\n for char in s:\n if char in char_set:\n return char\n else:\n char_set.add(char)\n return None" llm = FakeListChatModel(responses=[]) strategy = ReActCodeStrategy(llm=llm) - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert obs == gt_obs - assert strategy._current_answer == query + assert strategy._answer == query assert strategy._finished is True assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": "Done"} # Test error case. gt_scratchpad = "\nObservation 0: Invalid Action. Valid Actions are Implement[code] Test[code] and Finish[answer]." @@ -154,14 +163,17 @@ def test_generate_observation() -> None: query = "def first_repeated_char(s):\n char_set = set()\n for char in s:\n if char in char_set:\n return char\n else:\n char_set.add(char)\n return None" llm = FakeListChatModel(responses=[]) strategy = ReActCodeStrategy(llm=llm) - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert ( obs == "Invalid Action. Valid Actions are Implement[code] Test[code] and Finish[answer]." 
) - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._finished is False assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": ""} def test_create_output_dict() -> None: @@ -172,17 +184,21 @@ def test_create_output_dict() -> None: action_type = "implement" query = "def add(a, b): return a + b" obs = "Execution succeeded" - strategy._current_answer = "def add(a, b): return a + b" + strategy._answer = "def add(a, b): return a + b" + external_tool_info = {"execution_status": "Done"} expected_output = { "thought": thought, "action_type": action_type, "query": query, "observation": obs, - "answer": strategy._current_answer, + "answer": strategy._answer, + "external_tool_info": external_tool_info, } - output = strategy.create_output_dict(thought, action_type, query, obs) + output = strategy.create_output_dict( + thought, action_type, query, obs, external_tool_info + ) assert output == expected_output @@ -207,13 +223,13 @@ def test_reset() -> None: """Tests ReActCodeStrategy reset.""" llm = FakeListChatModel(responses=[]) strategy = ReActCodeStrategy(llm=llm) - strategy._current_answer = "def add(a, b): return a + b" + strategy._answer = "def add(a, b): return a + b" strategy._scratchpad = "Some scratchpad content" strategy._finished = True strategy.reset() - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._scratchpad == "" assert not strategy._finished diff --git a/tests/cog/strategies/react/test_math.py b/tests/cog/strategies/react/test_math.py index 27c12ffcd..f7d287bf5 100644 --- a/tests/cog/strategies/react/test_math.py +++ b/tests/cog/strategies/react/test_math.py @@ -67,9 +67,9 @@ def test_init() -> None: strategy = ReActMathStrategy(llm=llm) assert isinstance(strategy.llm, BaseChatModel) assert strategy.max_steps == 6 - assert strategy.max_tokens == 3896 + assert strategy.max_tokens == 5000 assert isinstance(strategy.enc, Encoding) - assert 
strategy._current_answer == "" + assert strategy._answer == "" assert strategy._scratchpad == "" assert strategy._finished == False @@ -91,7 +91,7 @@ def test_generate() -> None: additional_keys={}, ) assert out == gt_out - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._scratchpad == gt_scratchpad assert not strategy._finished @@ -114,7 +114,7 @@ def test_generate_action() -> None: ) assert action_type == "Calculate" assert query == gt_query - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._scratchpad == gt_scratchpad @@ -127,11 +127,14 @@ def test_generate_observation() -> None: query = "eggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_for_muffins = 4933828\neggs_used = eggs_for_breakfast + eggs_for_muffins\neggs_remaining = eggs_laid_per_day - eggs_used\nprice_per_egg = 2\nmoney_made_per_day = eggs_remaining * price_per_egg\nanswer = money_made_per_day" llm = FakeListChatModel(responses=[]) strategy = ReActMathStrategy(llm=llm) - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert obs == gt_obs - assert strategy._current_answer == query + assert strategy._answer == query assert strategy._finished is False assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": "Done", "code_answer": -9867630} # Test Finish. 
gt_obs = "\n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_for_muffins = 4933828\neggs_used = eggs_for_breakfast + eggs_for_muffins\neggs_remaining = eggs_laid_per_day - eggs_used\nprice_per_egg = 2\nmoney_made_per_day = eggs_remaining * price_per_egg\nanswer = money_made_per_day\n```" @@ -140,11 +143,14 @@ def test_generate_observation() -> None: query = "eggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_for_muffins = 4933828\neggs_used = eggs_for_breakfast + eggs_for_muffins\neggs_remaining = eggs_laid_per_day - eggs_used\nprice_per_egg = 2\nmoney_made_per_day = eggs_remaining * price_per_egg\nanswer = money_made_per_day" llm = FakeListChatModel(responses=[]) strategy = ReActMathStrategy(llm=llm) - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert obs == gt_obs - assert strategy._current_answer == query + assert strategy._answer == query assert strategy._finished is True assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": "Done", "code_answer": -9867630} # Test error case. gt_scratchpad = "\nObservation 0: Invalid Action. Valid Actions are Calculate[code] and Finish[answer]." @@ -152,13 +158,16 @@ def test_generate_observation() -> None: query = "eggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_for_muffins = 4933828\neggs_used = eggs_for_breakfast + eggs_for_muffins\neggs_remaining = eggs_laid_per_day - eggs_used\nprice_per_egg = 2\nmoney_made_per_day = eggs_remaining * price_per_egg\nanswer = money_made_per_day" llm = FakeListChatModel(responses=[]) strategy = ReActMathStrategy(llm=llm) - obs = strategy.generate_observation(idx=0, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=0, action_type=action_type, query=query + ) assert ( obs == "Invalid Action. 
Valid Actions are Calculate[code] and Finish[answer]." ) - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._finished is False assert strategy._scratchpad == gt_scratchpad + assert external_tool_info == {"execution_status": "", "code_answer": ""} def test_create_output_dict() -> None: @@ -171,17 +180,21 @@ def test_create_output_dict() -> None: "toys_initial = 5\ntoys_received = 2 + 2\nanswer = toys_initial + toys_received" ) obs = "\n```python\ntoys_initial = 5\ntoys_received = 2 + 2\nanswer = toys_initial + toys_received\n```\nExecution Status: Done\nOutput: answer = 9" + external_tool_info = {"execution_status": "Done", "code_answer": ["9"]} - strategy._current_answer = "answer = 9" + strategy._answer = "answer = 9" expected_output = { "thought": thought, "action_type": action_type, "query": query, "observation": obs, "answer": "answer = 9", + "external_tool_info": external_tool_info, } - result = strategy.create_output_dict(thought, action_type, query, obs) + result = strategy.create_output_dict( + thought, action_type, query, obs, external_tool_info + ) assert result == expected_output @@ -214,13 +227,13 @@ def test_reset() -> None: """Tests ReActMathStrategy reset.""" strategy = ReActMathStrategy(llm=FakeListChatModel(responses=[])) - strategy._current_answer = "answer = 9" + strategy._answer = "answer = 9" strategy._scratchpad = "Some scratchpad content" strategy._finished = True strategy.reset() - assert strategy._current_answer == "" + assert strategy._answer == "" assert strategy._scratchpad == "" assert strategy._finished == False diff --git a/tests/cog/strategies/react/test_qa.py b/tests/cog/strategies/react/test_qa.py index d41c84638..959a1e00e 100644 --- a/tests/cog/strategies/react/test_qa.py +++ b/tests/cog/strategies/react/test_qa.py @@ -44,7 +44,7 @@ def test_init() -> None: strategy = ReActQAStrategy(llm=llm) assert isinstance(strategy.llm, BaseChatModel) assert strategy.max_steps == 6 - assert 
strategy.max_tokens == 3896 + assert strategy.max_tokens == 5000 assert isinstance(strategy.docstore, DocstoreExplorer) assert isinstance(strategy.enc, Encoding) assert strategy._scratchpad == "" @@ -102,13 +102,25 @@ def test_generate_observation() -> None: init_scratchpad = "\nThought: I need to search for the best kickboxer in the world who has been involved in controversies and crimes.\nAction: Search[best kick boxer in the world controversies crimes]" responses = [] llm = FakeListChatModel(responses=responses) + strategy = ReActQAStrategy(llm=llm) strategy._scratchpad = init_scratchpad strategy._finished = False - obs = strategy.generate_observation(idx=1, action_type=action_type, query=query) + strategy.docstore.search = ( + lambda x: "Buakaw Banchamek has faced several controversies and legal issues." + ) + obs, external_tool_info = strategy.generate_observation( + idx=1, action_type=action_type, query=query + ) assert isinstance(obs, str) assert strategy._finished == False assert strategy._scratchpad != init_scratchpad + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert ( + external_tool_info["search_result"] + == "Buakaw Banchamek has faced several controversies and legal issues." + ) # Test finish. action_type = "Finish" @@ -119,11 +131,16 @@ def test_generate_observation() -> None: strategy = ReActQAStrategy(llm=llm) strategy._scratchpad = init_scratchpad strategy._finished = False - obs = strategy.generate_observation(idx=2, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=2, action_type=action_type, query=query + ) assert isinstance(obs, str) assert obs == "The best kickboxer is Buakaw Banchamek." 
assert strategy._finished == True assert strategy._scratchpad != init_scratchpad + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert external_tool_info == {"search_result": "", "lookup_result": ""} # Test search success. action_type = "Search" @@ -137,11 +154,19 @@ def test_generate_observation() -> None: strategy.docstore.search = ( lambda x: "Buakaw Banchamek has faced several controversies and legal issues." ) - obs = strategy.generate_observation(idx=3, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=3, action_type=action_type, query=query + ) assert isinstance(obs, str) assert obs == "Buakaw Banchamek has faced several controversies and legal issues." assert strategy._finished == False assert strategy._scratchpad != init_scratchpad + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert ( + external_tool_info["search_result"] + == "Buakaw Banchamek has faced several controversies and legal issues." + ) # Test search failure. action_type = "Search" @@ -155,11 +180,16 @@ def test_generate_observation() -> None: strategy.docstore.search = lambda x: (_ for _ in ()).throw( Exception("Search failed") ) - obs = strategy.generate_observation(idx=4, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=4, action_type=action_type, query=query + ) assert isinstance(obs, str) assert obs == "Could not find that page, please try again." assert strategy._finished == False assert strategy._scratchpad != init_scratchpad + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert external_tool_info["search_result"] == "" # Test lookup success. action_type = "Lookup" @@ -173,11 +203,16 @@ def test_generate_observation() -> None: strategy.docstore.lookup = ( lambda x: "Several controversies and legal issues related to Buakaw Banchamek." 
) - obs = strategy.generate_observation(idx=5, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=5, action_type=action_type, query=query + ) assert isinstance(obs, str) assert obs == "Several controversies and legal issues related to Buakaw Banchamek." assert strategy._finished == False assert strategy._scratchpad != init_scratchpad + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert external_tool_info["lookup_result"] != "" # Test lookup failure. action_type = "Lookup" @@ -191,7 +226,9 @@ def test_generate_observation() -> None: strategy.docstore.lookup = lambda x: (_ for _ in ()).throw( ValueError("Lookup failed") ) - obs = strategy.generate_observation(idx=6, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=6, action_type=action_type, query=query + ) assert isinstance(obs, str) assert ( obs @@ -199,6 +236,9 @@ def test_generate_observation() -> None: ) assert strategy._finished == False assert strategy._scratchpad != init_scratchpad + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert external_tool_info["lookup_result"] == "" # Test invalid action. action_type = "Invalid" @@ -209,12 +249,18 @@ def test_generate_observation() -> None: strategy = ReActQAStrategy(llm=llm) strategy._scratchpad = init_scratchpad strategy._finished = False - obs = strategy.generate_observation(idx=7, action_type=action_type, query=query) + obs, external_tool_info = strategy.generate_observation( + idx=7, action_type=action_type, query=query + ) assert isinstance(obs, str) assert ( obs == "Invalid Action. Valid Actions are Lookup[] Search[] and Finish[]." 
) + assert "search_result" in external_tool_info + assert "lookup_result" in external_tool_info + assert external_tool_info["search_result"] == "" + assert external_tool_info["lookup_result"] == "" def test_create_output_dict() -> None: @@ -225,16 +271,22 @@ def test_create_output_dict() -> None: action_type = "search" query = "query" obs = "observation" + external_tool_info = {"search_result": "", "lookup_result": ""} expected_output = { "thought": thought, "action_type": action_type, "query": query, "observation": obs, + "answer": "", + "external_tool_info": {"search_result": "", "lookup_result": ""}, } assert ( - strategy.create_output_dict(thought, action_type, query, obs) == expected_output + strategy.create_output_dict( + thought, action_type, query, obs, external_tool_info + ) + == expected_output ) diff --git a/tests/cog/strategies/reflexion/test_math.py b/tests/cog/strategies/reflexion/test_math.py new file mode 100644 index 000000000..866edfa51 --- /dev/null +++ b/tests/cog/strategies/reflexion/test_math.py @@ -0,0 +1,564 @@ +"""Unit tests for Reflexion Math strategies.""" + +from langchain_community.chat_models.fake import FakeListChatModel +from langchain_core.language_models.chat_models import BaseChatModel + +from agential.cog.modules.reflect.reflexion import ( + ReflexionCoTReflector, + ReflexionReActReflector, +) +from agential.cog.prompts.agent.reflexion import ( + GSM8K_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT, + GSM8K_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT, + REFLEXION_COT_INSTRUCTION_GSM8K, + REFLEXION_COT_REFLECT_INSTRUCTION_GSM8K, + REFLEXION_REACT_INSTRUCTION_GSM8K, + REFLEXION_REACT_REFLECT_INSTRUCTION_GSM8K, +) +from agential.cog.prompts.benchmark.gsm8k import ( + GSM8K_FEWSHOT_EXAMPLES_COT, + GSM8K_FEWSHOT_EXAMPLES_REACT, +) +from agential.cog.strategies.reflexion.math import ( + ReflexionCoTGSM8KStrategy, + ReflexionCoTMathStrategy, + ReflexionCoTSVAMPStrategy, + ReflexionCoTTabMWPStrategy, + ReflexionReActMathStrategy, + 
ReflexionReActSVAMPStrategy, + ReflexionReActTabMWPStrategy, + parse_math_action_cot, + parse_math_action_react, +) + + +def test_parse_math_action_cot() -> None: + """Tests parse_math_action_cot.""" + action = "Finish the calculation```python\nresult = 5 + 3\n```" + action_type, query = parse_math_action_cot(action) + assert action_type == "Finish" + assert query == "result = 5 + 3" + + action = "complete the task```python\nanswer = 10 * 2\n```" + action_type, query = parse_math_action_cot(action) + assert action_type == "" + assert query == "" + + +def test_parse_math_action_react() -> None: + """Tests parse_math_action_react.""" + action = "Calculate the sum```python\nsum = 4 + 6\n```" + action_type, query = parse_math_action_react(action) + assert action_type == "Calculate" + assert query == "sum = 4 + 6" + + action = "Finish the operation```python\nresult = 7 - 2\n```" + action_type, query = parse_math_action_react(action) + assert action_type == "Finish" + assert query == "result = 7 - 2" + + action = "complete the task```python\noutput = 10 / 2\n```" + action_type, query = parse_math_action_react(action) + assert action_type == "" + assert query == "" + + +def test_reflexion_cot_init() -> None: + """Tests ReflexionCoTMathStrategy init.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionCoTGSM8KStrategy(llm=llm) + assert isinstance(strategy.llm, BaseChatModel) + assert isinstance(strategy.reflector, ReflexionCoTReflector) + assert strategy.max_reflections == 3 + assert strategy.max_trials == 1 + assert strategy._scratchpad == "" + assert strategy._finished == False + assert strategy._answer == "" + + +def test_reflexion_cot_generate() -> None: + """Tests ReflexionCoTMathStrategy generate.""" + question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. 
How much in dollars does she make every day at the farmers' market?" + + gt_out = "Let's calculate the total number of eggs she sells after breakfast and baking muffins. Then, we can find out how much she makes daily at the farmers' market." + gt_scratchpad = "\nThought: Let's calculate the total number of eggs she sells after breakfast and baking muffins. Then, we can find out how much she makes daily at the farmers' market." + responses = [ + "Let's calculate the total number of eggs she sells after breakfast and baking muffins. Then, we can find out how much she makes daily at the farmers' market.\nAction: Finish[\n```python\neggs_per_day = 16\neggs_for_breakfast = 3\neggs_for_muffins = 4933828\ntotal_eggs_sold = eggs_per_day - eggs_for_breakfast - eggs_for_muffins\nprice_per_egg = 2\ndaily_income = total_eggs_sold * price_per_egg\nanswer = daily_income\n```\n]" + ] + llm = FakeListChatModel(responses=responses) + strategy = ReflexionCoTMathStrategy(llm=llm) + out = strategy.generate( + question=question, + examples=GSM8K_FEWSHOT_EXAMPLES_COT, + reflections="", + prompt=REFLEXION_COT_INSTRUCTION_GSM8K, + additional_keys={}, + ) + assert out == gt_out + assert strategy._scratchpad == gt_scratchpad + assert strategy._finished == False + assert strategy._answer == "" + + +def test_reflexion_cot_generate_action() -> None: + """Tests ReflexionCoTMathStrategy generate_action.""" + question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?" 
+ + responses = [ + "Finish[\n```python\neggs_laid_per_day = 16\neggs_eaten_for_breakfast = 3\neggs_used_for_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_eaten_for_breakfast - eggs_used_for_muffins\nprice_per_egg = 2\nmoney_made_per_day = eggs_sold * price_per_egg\nanswer = money_made_per_day\n```\n]" + ] + llm = FakeListChatModel(responses=responses) + strategy = ReflexionCoTMathStrategy(llm=llm) + action_type, query = strategy.generate_action( + question=question, + examples=GSM8K_FEWSHOT_EXAMPLES_COT, + reflections="", + prompt=REFLEXION_COT_INSTRUCTION_GSM8K, + additional_keys={}, + ) + assert action_type == "Finish" + assert ( + query + == "eggs_laid_per_day = 16\neggs_eaten_for_breakfast = 3\neggs_used_for_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_eaten_for_breakfast - eggs_used_for_muffins\nprice_per_egg = 2\nmoney_made_per_day = eggs_sold * price_per_egg\nanswer = money_made_per_day" + ) + assert strategy._finished == False + assert strategy._answer == "" + assert ( + strategy._scratchpad + == "\nAction: Finish[\n```python\neggs_laid_per_day = 16\neggs_eaten_for_breakfast = 3\neggs_used_for_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_eaten_for_breakfast - eggs_used_for_muffins\nprice_per_egg = 2\nmoney_made_per_day = eggs_sold * price_per_egg\nanswer = money_made_per_day\n```\n]" + ) + + +def test_reflexion_cot_generate_observation() -> None: + """Tests ReflexionCoTMathStrategy generate_observation.""" + # Case 1: action_type is "Finish" and answer is correct. + llm = FakeListChatModel(responses=[]) + strategy = ReflexionCoTMathStrategy(llm=llm) + is_correct, obs = strategy.generate_observation( + action_type="Finish", query="correct_answer", key="correct_answer" + ) + assert is_correct == False + assert obs == "Answer is INCORRECT" + assert "Observation: Answer is INCORRECT" in strategy._scratchpad + + # Case 2: action_type is "Finish" and answer is incorrect. 
+ strategy = ReflexionCoTMathStrategy(llm=llm) + is_correct, obs = strategy.generate_observation( + action_type="Finish", query="incorrect_answer", key="correct_answer" + ) + assert is_correct == False + assert obs == "Answer is INCORRECT" + assert "Observation: Answer is INCORRECT" in strategy._scratchpad + + # Case 3: action_type is not "Finish". + strategy = ReflexionCoTMathStrategy(llm=llm) + is_correct, obs = strategy.generate_observation( + action_type="Calculate", query="some_query", key="correct_answer" + ) + assert is_correct == False + assert obs == "Invalid action type, please try again." + assert "Observation: Invalid action type, please try again." in strategy._scratchpad + + +def test_reflexion_cot_create_output_dict() -> None: + """Tests ReflexionCoTMathStrategy create_output_dict.""" + strategy = ReflexionCoTMathStrategy(llm=FakeListChatModel(responses=[])) + + # Setting a dummy answer for testing. + strategy._answer = "correct_answer" + + # Test case 1: Correct answer. + output = strategy.create_output_dict( + thought="This is a thought.", + action_type="Finish", + obs="Observation: Answer is CORRECT", + is_correct=True, + reflections=[], + ) + expected_output = { + "thought": "This is a thought.", + "action_type": "Finish", + "observation": "Observation: Answer is CORRECT", + "answer": "correct_answer", + "is_correct": True, + "reflections": [], + } + assert output == expected_output + + # Test case 2: Incorrect answer. 
+ strategy._answer = "incorrect_answer" + output = strategy.create_output_dict( + thought="This is a thought.", + action_type="Finish", + obs="Observation: Answer is INCORRECT", + is_correct=False, + reflections=[], + ) + expected_output = { + "thought": "This is a thought.", + "action_type": "Finish", + "observation": "Observation: Answer is INCORRECT", + "answer": "incorrect_answer", + "is_correct": False, + "reflections": [], + } + assert output == expected_output + + +def test_reflexion_cot_halting_condition() -> None: + """Tests ReflexionCoTMathStrategy halting_condition.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionCoTMathStrategy(llm=llm, max_trials=3) + + strategy._answer = "incorrect_answer" + assert strategy.halting_condition(3, "correct_answer") == True + + strategy._answer = "correct_answer" + assert strategy.halting_condition(2, "correct_answer") == False + + strategy._answer = "incorrect_answer" + assert strategy.halting_condition(2, "correct_answer") == False + + +def test_reflexion_cot_reset() -> None: + """Tests ReflexionCoTMathStrategy reset.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionCoTMathStrategy(llm=llm, max_trials=3) + + strategy._scratchpad = "Initial scratchpad content" + strategy._finished = True + strategy._answer = "Some answer" + + # Test case 1: Reset everything. + strategy.reset() + assert strategy._scratchpad == "" + assert strategy._finished == False + assert strategy._answer == "" + + strategy._scratchpad = "Initial scratchpad content" + strategy._finished = True + strategy._answer = "Some answer" + + # Test case 2: Reset only scratchpad. + strategy.reset(only_scratchpad=True) + assert strategy._scratchpad == "" + assert strategy._finished == True + assert strategy._answer == "Some answer" + + +def test_reflexion_cot_reflect() -> None: + """Tests ReflexionCoTMathStrategy reflect.""" + question = "Janet's ducks lay 16 eggs per day. 
She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?" + + llm = FakeListChatModel(responses=[]) + strategy = ReflexionCoTMathStrategy(llm=llm, max_trials=3) + + gt_out = "You have attempted to answer the following question before and failed. Below is the last trial you attempted to answer the question.\nQuestion: Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\n\n(END PREVIOUS TRIAL)\n" + _, out = strategy.reflect( + reflect_strategy="last_attempt", + question=question, + examples=GSM8K_FEWSHOT_EXAMPLES_REFLEXION_COT_REFLECT, + prompt=REFLEXION_COT_REFLECT_INSTRUCTION_GSM8K, + additional_keys={}, + ) + assert out == gt_out + + +def test_reflexion_cot_reflect_condition() -> None: + """Tests ReflexionCoTMathStrategy reflect_condition.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionCoTMathStrategy(llm) + + assert not strategy.reflect_condition(0, "strategy1", "key1") + assert strategy.reflect_condition(1, "strategy1", "key1") + assert strategy.reflect_condition(1, "strategy1", "key2") + assert strategy.reflect_condition(1, "", "key2") + + +def test_reflexion_cot_instantiate_strategies() -> None: + """Tests ReflexionCoTMathStrategy instantiate strategies.""" + llm = FakeListChatModel(responses=[]) + gsm8k_strategy = ReflexionCoTGSM8KStrategy(llm=llm) + svamp_strategy = ReflexionCoTSVAMPStrategy(llm=llm) + tabmwp_strategy = ReflexionCoTTabMWPStrategy(llm=llm) + + assert isinstance(gsm8k_strategy, ReflexionCoTGSM8KStrategy) + assert isinstance(svamp_strategy, ReflexionCoTSVAMPStrategy) + assert isinstance(tabmwp_strategy, 
ReflexionCoTTabMWPStrategy) + + +def test_reflexion_react_init() -> None: + """Tests ReflexionReActMathStrategy init.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionReActMathStrategy(llm=llm) + assert isinstance(strategy.llm, BaseChatModel) + assert isinstance(strategy.reflector, ReflexionReActReflector) + assert strategy.max_reflections == 3 + assert strategy.max_trials == 1 + assert strategy._scratchpad == "" + assert strategy._finished == False + assert strategy._answer == "" + + +def test_reflexion_react_generate() -> None: + """Tests ReflexionReActMathStrategy generate.""" + question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?" + + gt_scratchpad = "\nThought: I need to calculate how much money Janet makes at the farmers' market daily based on the number of fresh duck eggs she sells." + gt_out = "I need to calculate how much money Janet makes at the farmers' market daily based on the number of fresh duck eggs she sells." 
+ responses = [ + "I need to calculate how much money Janet makes at the farmers' market daily based on the number of fresh duck eggs she sells.\n\nAction: Calculate[\n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_baked_into_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_baked_into_muffins\nprice_per_egg = 2\ntotal_money_daily = eggs_sold * price_per_egg\nanswer = total_money_daily\n```\n]\n\nObservation: \n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_baked_into_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_baked_into_muffins\nprice_per_egg = 2\ntotal_money_daily = eggs_sold * price_per_egg\nanswer = total_money_daily\n```\nExecution Status: Done\nOutput: answer = -9867653\n\nThought: The answer is negative, which doesn't make sense. I must have made a mistake in the calculation.\n\nAction: Calculate[\n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_baked_into_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_baked_into_muffins\nprice_per_egg = 2\ntotal_money_daily = eggs_sold * price_per_egg\ntotal_money_daily = abs(total_money_daily) # Taking the absolute value\nanswer = total_money_daily\n```\n]\n\nObservation: \n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_baked_into_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_baked_into_muffins\nprice_per_egg = 2\ntotal_money_daily = eggs_sold * price_per_egg\ntotal_money_daily = abs(total_money_daily) # Taking the absolute value\nanswer = total_money_daily\n```\nExecution Status: Done\nOutput: answer = 9867646\n\nThought: Janet makes $9867646 every day at the farmers' market.\nAction: Finish[\n```python\nanswer = 9867646\n```\n]\n\nObservation: \n```python\nanswer = 9867646\n```" + ] + llm = FakeListChatModel(responses=responses) + strategy = ReflexionReActMathStrategy(llm=llm) + out = strategy.generate( + question=question, + 
examples=GSM8K_FEWSHOT_EXAMPLES_REACT, + reflections="", + prompt=REFLEXION_REACT_INSTRUCTION_GSM8K, + additional_keys={}, + max_steps=5, + ) + assert out == gt_out + assert strategy._scratchpad == gt_scratchpad + + +def test_reflexion_react_generate_action() -> None: + """Tests ReflexionReActMathStrategy generate_action.""" + question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?" + + gt_scratchpad = "\nAction: Calculate[\n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_used_in_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_used_in_muffins\nprice_per_egg = 2\ndaily_income = eggs_sold * price_per_egg\nanswer = daily_income\n```\n]" + responses = [ + "Calculate[\n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_used_in_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_used_in_muffins\nprice_per_egg = 2\ndaily_income = eggs_sold * price_per_egg\nanswer = daily_income\n```\n]" + ] + llm = FakeListChatModel(responses=responses) + strategy = ReflexionReActMathStrategy(llm=llm) + action_type, query = strategy.generate_action( + question=question, + examples=GSM8K_FEWSHOT_EXAMPLES_REACT, + reflections="", + prompt=REFLEXION_REACT_INSTRUCTION_GSM8K, + additional_keys={}, + max_steps=5, + ) + assert action_type == "Calculate" + assert ( + query + == "eggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_used_in_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_used_in_muffins\nprice_per_egg = 2\ndaily_income = eggs_sold * price_per_egg\nanswer = daily_income" + ) + assert strategy._scratchpad == gt_scratchpad + + +def test_reflexion_react_generate_observation() -> None: + """Tests ReflexionReActMathStrategy 
generate_observation.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionReActMathStrategy(llm=llm) + + # Test Calculate. + is_correct, obs, external_tool_info = strategy.generate_observation( + step_idx=1, + action_type="Calculate", + query="eggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_used_in_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_used_in_muffins\nprice_per_egg = 2\ndaily_income = eggs_sold * price_per_egg\nanswer = daily_income", + key=-9867630, + ) + assert is_correct + assert ( + obs + == "\n```python\neggs_laid_per_day = 16\neggs_for_breakfast = 3\neggs_used_in_muffins = 4933828\neggs_sold = eggs_laid_per_day - eggs_for_breakfast - eggs_used_in_muffins\nprice_per_egg = 2\ndaily_income = eggs_sold * price_per_egg\nanswer = daily_income\n```\nExecution Status: Done\nOutput: answer = -9867630" + ) + assert external_tool_info == {"execution_status": "Done", "code_answer": -9867630} + + # Test Finish incorrect. + is_correct, obs, external_tool_info = strategy.generate_observation( + step_idx=1, + action_type="Finish", + query="answer = 5", + key="key1", + ) + assert not is_correct + assert obs == "Answer is INCORRECT" + assert strategy._scratchpad != "" + assert strategy._finished + assert strategy._answer == "answer = 5" + assert external_tool_info == {"code_answer": 5, "execution_status": "Done"} + + # Test Finish correct. + is_correct, obs, external_tool_info = strategy.generate_observation( + step_idx=1, + action_type="Finish", + query="answer = 5", + key=5, + ) + assert is_correct + assert obs == "Answer is CORRECT" + assert strategy._scratchpad != "" + assert strategy._finished + assert strategy._answer == "answer = 5" + assert external_tool_info == {"code_answer": 5, "execution_status": "Done"} + + # Test invalid. 
+ is_correct, obs, external_tool_info = strategy.generate_observation( + step_idx=1, + action_type="Invalid", + query="answer = 5", + key=5, + ) + assert is_correct + assert ( + obs == "Invalid Action. Valid Actions are Calculate[code] and Finish[answer]." + ) + assert strategy._scratchpad != "" + assert strategy._finished + assert strategy._answer == "answer = 5" + assert external_tool_info == {"code_answer": "", "execution_status": ""} + + +def test_reflexion_react_create_output_dict() -> None: + """Tests ReflexionReActMathStrategy create_output_dict.""" + strategy = ReflexionReActMathStrategy(llm=FakeListChatModel(responses=[])) + react_out = [ + { + "thought": "First thought", + "action_type": "Query", + "query": "What is the capital of France?", + "observation": "Observation: Answer is CORRECT", + "is_correct": True, + } + ] + reflections = "Reflection on the first thought." + output = strategy.create_output_dict(react_out, reflections) + expected_output = { + "react_output": react_out, + "reflections": reflections, + } + assert output == expected_output + + +def test_reflexion_react_react_create_output_dict() -> None: + """Tests ReflexionReActMathStrategy react_create_output_dict.""" + strategy = ReflexionReActMathStrategy(llm=FakeListChatModel(responses=[])) + + # Test case 1: Valid output creation + output = strategy.react_create_output_dict( + thought="Initial thought", + action_type="Query", + query="What is the capital of France?", + obs="Observation: Answer is CORRECT", + external_tool_info={"search_result": "", "lookup_result": ""}, + is_correct=True, + ) + expected_output = { + "thought": "Initial thought", + "action_type": "Query", + "query": "What is the capital of France?", + "observation": "Observation: Answer is CORRECT", + "answer": "", + "external_tool_info": {"search_result": "", "lookup_result": ""}, + "is_correct": True, + } + assert output == expected_output + + +def test_reflexion_react_halting_condition() -> None: + """Tests 
ReflexionReActMathStrategy halting_condition.""" +    llm = FakeListChatModel(responses=[]) + +    # Test case 1: Halting condition met because answer is incorrect and index is less than max_trials. +    strategy = ReflexionReActMathStrategy(llm=llm, max_trials=5) +    strategy._answer = "incorrect_answer" +    assert strategy.halting_condition(3, "correct_answer") == True + +    # Test case 2: Halting condition still met (returns True) even though the stored answer matches the key. +    strategy = ReflexionReActMathStrategy(llm=llm, max_trials=5) +    strategy._answer = "correct_answer" +    assert strategy.halting_condition(3, "correct_answer") == True + +    # Test case 3: Halting condition not met because index is greater than or equal to max_trials. +    strategy = ReflexionReActMathStrategy(llm=llm, max_trials=3) +    strategy._answer = "incorrect_answer" +    assert strategy.halting_condition(4, "correct_answer") == False + +    # Test case 4: Halting condition met using max_trials from kwargs. +    strategy = ReflexionReActMathStrategy(llm=llm, max_trials=5) +    strategy._answer = "incorrect_answer" +    assert strategy.halting_condition(3, "correct_answer", max_trials=4) == True + +    # Test case 5: Halting condition not met using max_trials from kwargs. +    strategy = ReflexionReActMathStrategy(llm=llm, max_trials=5) +    strategy._answer = "incorrect_answer" +    assert strategy.halting_condition(4, "correct_answer", max_trials=3) == False + + +def test_reflexion_react_react_halting_condition() -> None: +    """Tests ReflexionReActMathStrategy react_halting_condition.""" +    strategy = ReflexionReActMathStrategy(llm=FakeListChatModel(responses=[])) + +    idx = 0 +    question = "What is the capital of France?" +    examples = "" +    reflections = "" +    prompt = "Answer the question." 
+ + assert not strategy.react_halting_condition( + idx, question, examples, reflections, prompt, {} + ) + + +def test_reflexion_react_reset() -> None: + """Tests ReflexionReActMathStrategy reset.""" + llm = FakeListChatModel(responses=[]) + strategy = ReflexionReActMathStrategy(llm=llm) + strategy._scratchpad = "Some previous state" + strategy._finished = True + + strategy.reset() + + assert strategy._scratchpad == "" + assert not strategy._finished + + +def test_reflexion_react_reflect() -> None: + """Tests ReflexionReActMathStrategy reflect.""" + question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?" + + gt_reflections = "You have attempted to answer following question before and failed. The following reflection(s) give a plan to avoid failing to answer the question in the same way you did previously. Use them to improve your strategy of correctly answering the given question.\nReflections:\n- 1" + llm = FakeListChatModel(responses=["1"]) + strategy = ReflexionReActMathStrategy(llm=llm) + _, reflections = strategy.reflect( + reflect_strategy="reflexion", + question=question, + examples=GSM8K_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT, + prompt=REFLEXION_REACT_REFLECT_INSTRUCTION_GSM8K, + additional_keys={}, + ) + assert reflections == gt_reflections + + +def test_reflexion_react_reflect_condition() -> None: + """Tests ReflexionReActMathStrategy reflect_condition.""" + question = "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with 4933828. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?" 
+ + llm = FakeListChatModel(responses=["1"]) + strategy = ReflexionReActMathStrategy(llm=llm) + out = strategy.reflect_condition( + step_idx=1, + reflect_strategy="reflexion", + question=question, + examples=GSM8K_FEWSHOT_EXAMPLES_REFLEXION_REACT_REFLECT, + key="key", + prompt=REFLEXION_REACT_REFLECT_INSTRUCTION_GSM8K, + additional_keys={}, + ) + assert not out + + +def test_reflexion_react_instantiate_strategies() -> None: + """Tests ReflexionReActMathStrategy instantiate strategies.""" + llm = FakeListChatModel(responses=[]) + gsm8k_strategy = ReflexionReActMathStrategy(llm=llm) + svamp_strategy = ReflexionReActSVAMPStrategy(llm=llm) + tabmwp_strategy = ReflexionReActTabMWPStrategy(llm=llm) + + assert isinstance(gsm8k_strategy, ReflexionReActMathStrategy) + assert isinstance(svamp_strategy, ReflexionReActSVAMPStrategy) + assert isinstance(tabmwp_strategy, ReflexionReActTabMWPStrategy) diff --git a/tests/cog/strategies/reflexion/test_qa.py b/tests/cog/strategies/reflexion/test_qa.py index 524d9bb71..02d29ed13 100644 --- a/tests/cog/strategies/reflexion/test_qa.py +++ b/tests/cog/strategies/reflexion/test_qa.py @@ -30,9 +30,38 @@ ReflexionReActHotQAStrategy, ReflexionReActQAStrategy, ReflexionReActTriviaQAStrategy, + parse_qa_action, ) +def test_parse_qa_action() -> None: + """Tests parse_qa_action.""" + action = "Calculate[sum = 4 + 6]" + action_type, argument = parse_qa_action(action) + assert action_type == "Calculate" + assert argument == "sum = 4 + 6" + + action = "Finish[result = 7 - 2]" + action_type, argument = parse_qa_action(action) + assert action_type == "Finish" + assert argument == "result = 7 - 2" + + action = "InvalidAction[result = 10 / 2]" + action_type, argument = parse_qa_action(action) + assert action_type == "InvalidAction" + assert argument == "result = 10 / 2" + + action = "NoBrackets" + action_type, argument = parse_qa_action(action) + assert action_type == "" + assert argument == "" + + action = "EmptyBrackets[]" + action_type, argument = 
parse_qa_action(action) + assert action_type == "" + assert argument == "" + + def test_reflexion_cot_init() -> None: """Test ReflexionCoTQAStrategy initialization.""" llm = FakeListChatModel(responses=[]) @@ -143,7 +172,7 @@ def test_reflexion_cot_create_output_dict() -> None: expected_output = { "thought": "This is a thought.", "action_type": "Finish", - "obs": "Observation: Answer is CORRECT", + "observation": "Observation: Answer is CORRECT", "answer": "correct_answer", "is_correct": True, "reflections": [], @@ -162,7 +191,7 @@ def test_reflexion_cot_create_output_dict() -> None: expected_output = { "thought": "This is a thought.", "action_type": "Finish", - "obs": "Observation: Answer is INCORRECT", + "observation": "Observation: Answer is INCORRECT", "answer": "incorrect_answer", "is_correct": False, "reflections": [], @@ -181,7 +210,7 @@ def test_reflexion_cot_create_output_dict() -> None: expected_output = { "thought": "This is another thought.", "action_type": "Calculate", - "obs": "Observation: Invalid action type, please try again.", + "observation": "Observation: Invalid action type, please try again.", "answer": "some_answer", "is_correct": False, "reflections": [], @@ -336,7 +365,8 @@ def test_reflexion_react_generate_observation() -> None: """Tests ReflexionReActQAStrategy generate_observation.""" llm = FakeListChatModel(responses=[]) strategy = ReflexionReActQAStrategy(llm=llm) - is_correct, obs = strategy.generate_observation( + strategy.docstore.search = lambda x: "Search result" + is_correct, obs, external_tool_info = strategy.generate_observation( step_idx=1, action_type="Search", query="VIVA Media AG", @@ -347,8 +377,10 @@ def test_reflexion_react_generate_observation() -> None: assert strategy._scratchpad != "" assert not strategy._finished assert strategy._answer == "" + assert external_tool_info == {"search_result": "Search result", "lookup_result": ""} - is_correct, obs = strategy.generate_observation( + strategy.docstore.lookup = lambda x: 
"Lookup result" + is_correct, obs, external_tool_info = strategy.generate_observation( step_idx=1, action_type="Lookup", query="VIVA Media AG", @@ -359,8 +391,9 @@ def test_reflexion_react_generate_observation() -> None: assert strategy._scratchpad != "" assert not strategy._finished assert strategy._answer == "" + assert external_tool_info == {"search_result": "", "lookup_result": "Lookup result"} - is_correct, obs = strategy.generate_observation( + is_correct, obs, external_tool_info = strategy.generate_observation( step_idx=1, action_type="Finish", query="VIVA Media AG", @@ -371,6 +404,7 @@ def test_reflexion_react_generate_observation() -> None: assert strategy._scratchpad != "" assert strategy._finished assert strategy._answer == "VIVA Media AG" + assert external_tool_info == {"search_result": "", "lookup_result": ""} def test_reflexion_react_create_output_dict() -> None: @@ -441,6 +475,7 @@ def test_reflexion_react_react_create_output_dict() -> None: action_type="Query", query="What is the capital of France?", obs="Observation: Answer is CORRECT", + external_tool_info={"search_result": "", "lookup_result": ""}, is_correct=True, ) expected_output = { @@ -448,6 +483,8 @@ def test_reflexion_react_react_create_output_dict() -> None: "action_type": "Query", "query": "What is the capital of France?", "observation": "Observation: Answer is CORRECT", + "answer": "", + "external_tool_info": {"search_result": "", "lookup_result": ""}, "is_correct": True, } assert output == expected_output @@ -458,6 +495,7 @@ def test_reflexion_react_react_create_output_dict() -> None: action_type="Validate", query="Is 2+2=4?", obs="Observation: Answer is CORRECT", + external_tool_info={"search_result": "", "lookup_result": ""}, is_correct=True, ) expected_output = { @@ -465,6 +503,8 @@ def test_reflexion_react_react_create_output_dict() -> None: "action_type": "Validate", "query": "Is 2+2=4?", "observation": "Observation: Answer is CORRECT", + "answer": "", + "external_tool_info": 
{"search_result": "", "lookup_result": ""}, "is_correct": True, } assert output == expected_output @@ -475,6 +515,7 @@ def test_reflexion_react_react_create_output_dict() -> None: action_type="Answer", query="What is the square root of 16?", obs="Observation: Answer is INCORRECT", + external_tool_info={"search_result": "", "lookup_result": ""}, is_correct=False, ) expected_output = { @@ -482,6 +523,8 @@ def test_reflexion_react_react_create_output_dict() -> None: "action_type": "Answer", "query": "What is the square root of 16?", "observation": "Observation: Answer is INCORRECT", + "answer": "", + "external_tool_info": {"search_result": "", "lookup_result": ""}, "is_correct": False, } assert output == expected_output diff --git a/tests/cog/strategies/test_strategy_factory.py b/tests/cog/strategies/test_strategy_factory.py index ac8413518..f97ffb5b4 100644 --- a/tests/cog/strategies/test_strategy_factory.py +++ b/tests/cog/strategies/test_strategy_factory.py @@ -34,6 +34,14 @@ ReActHotQAStrategy, ReActTriviaQAStrategy, ) +from agential.cog.strategies.reflexion.math import ( + ReflexionCoTGSM8KStrategy, + ReflexionCoTSVAMPStrategy, + ReflexionCoTTabMWPStrategy, + ReflexionReActGSM8KStrategy, + ReflexionReActSVAMPStrategy, + ReflexionReActTabMWPStrategy, +) from agential.cog.strategies.reflexion.qa import ( ReflexionCoTAmbigNQStrategy, ReflexionCoTFEVERStrategy, @@ -235,6 +243,20 @@ def test_reflexioncot_strategy_factory_get_strategy() -> None: ReflexionCoTFEVERStrategy, ) + # Math benchmarks. + assert isinstance( + ReflexionCoTStrategyFactory.get_strategy({"math": "gsm8k"}, llm=llm), + ReflexionCoTGSM8KStrategy, + ) + assert isinstance( + ReflexionCoTStrategyFactory.get_strategy({"math": "svamp"}, llm=llm), + ReflexionCoTSVAMPStrategy, + ) + assert isinstance( + ReflexionCoTStrategyFactory.get_strategy({"math": "tabmwp"}, llm=llm), + ReflexionCoTTabMWPStrategy, + ) + # Test kwargs for QA strategy. 
strategy = ReflexionCoTStrategyFactory.get_strategy( {"qa": "hotpotqa"}, llm=llm, max_reflections=1 @@ -279,6 +301,20 @@ def test_reflexionreact_strategy_factory_get_strategy() -> None: ReflexionReActFEVERStrategy, ) + # Math benchmarks. + assert isinstance( + ReflexionReActStrategyFactory.get_strategy({"math": "gsm8k"}, llm=llm), + ReflexionReActGSM8KStrategy, + ) + assert isinstance( + ReflexionReActStrategyFactory.get_strategy({"math": "svamp"}, llm=llm), + ReflexionReActSVAMPStrategy, + ) + assert isinstance( + ReflexionReActStrategyFactory.get_strategy({"math": "tabmwp"}, llm=llm), + ReflexionReActTabMWPStrategy, + ) + # Test kwargs for QA strategy. strategy = ReflexionReActStrategyFactory.get_strategy( {"qa": "hotpotqa"}, llm=llm, max_reflections=1