diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 643e1faeeb..1e67f329f0 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -9228,11 +9228,21 @@ "type": "object", "properties": { "tool_responses": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolResponseMessage" - }, - "description": "The tool call responses to resume the turn with." + "oneOf": [ + { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolResponse" + } + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolResponseMessage" + } + } + ], + "description": "The tool call responses to resume the turn with. NOTE: ToolResponseMessage will be deprecated. Use ToolResponse." }, "stream": { "type": "boolean", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index eb31b61fb7..cebba8368a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6153,11 +6153,16 @@ components: type: object properties: tool_responses: - type: array - items: - $ref: '#/components/schemas/ToolResponseMessage' + oneOf: + - type: array + items: + $ref: '#/components/schemas/ToolResponse' + - type: array + items: + $ref: '#/components/schemas/ToolResponseMessage' description: >- - The tool call responses to resume the turn with. + The tool call responses to resume the turn with. NOTE: ToolResponseMessage + will be deprecated. Use ToolResponse. stream: type: boolean description: Whether to stream the response. diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index eb33997887..79df0fa40c 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -302,7 +302,7 @@ class AgentTurnResumeRequest(BaseModel): agent_id: str session_id: str turn_id: str - tool_responses: List[ToolResponseMessage] + tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]] stream: Optional[bool] = False @@ -363,7 +363,7 @@ async def resume_agent_turn( agent_id: str, session_id: str, turn_id: str, - tool_responses: List[ToolResponseMessage], + tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]], stream: Optional[bool] = False, ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: """Resume an agent turn with executed tool call responses. @@ -374,6 +374,7 @@ async def resume_agent_turn( :param session_id: The ID of the session to resume. :param turn_id: The ID of the turn to resume. :param tool_responses: The tool call responses to resume the turn with. + NOTE: ToolResponseMessage will be deprecated. Use ToolResponse. :param stream: Whether to stream the response. :returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects. 
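        Example (an illustrative sketch only: ``agents_api``, ``agent_id``, ``session_id``,
        ``turn_id``, and ``pending_call`` are placeholder names for this docstring, not
        part of the API):

            # Resume with the new ToolResponse type. A List[ToolResponseMessage]
            # is still accepted, but is slated for deprecation.
            turn = await agents_api.resume_agent_turn(
                agent_id=agent_id,
                session_id=session_id,
                turn_id=turn_id,
                tool_responses=[
                    ToolResponse(
                        call_id=pending_call.call_id,      # id of the awaiting tool call
                        tool_name=pending_call.tool_name,
                        content="-100",
                        metadata={"source": "https://www.google.com"},  # optional metadata
                    )
                ],
                stream=False,
            )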
""" diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index f868bee2cd..720e73503b 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -216,13 +216,25 @@ async def _run_turn( steps = [] messages = await self.get_messages_from_turns(turns) if is_resume: - messages.extend(request.tool_responses) + if isinstance(request.tool_responses[0], ToolResponseMessage): + tool_response_messages = request.tool_responses + tool_responses = [ + ToolResponse(call_id=x.call_id, tool_name=x.tool_name, content=x.content) + for x in request.tool_responses + ] + else: + tool_response_messages = [ + ToolResponseMessage(call_id=x.call_id, tool_name=x.tool_name, content=x.content) + for x in request.tool_responses + ] + tool_responses = request.tool_responses + messages.extend(tool_response_messages) last_turn = turns[-1] last_turn_messages = self.turn_to_messages(last_turn) last_turn_messages = [ x for x in last_turn_messages if isinstance(x, UserMessage) or isinstance(x, ToolResponseMessage) ] - last_turn_messages.extend(request.tool_responses) + last_turn_messages.extend(tool_response_messages) # get steps from the turn steps = last_turn.steps @@ -238,14 +250,7 @@ async def _run_turn( step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())), turn_id=request.turn_id, tool_calls=(in_progress_tool_call_step.tool_calls if in_progress_tool_call_step else []), - tool_responses=[ - ToolResponse( - call_id=x.call_id, - tool_name=x.tool_name, - content=x.content, - ) - for x in request.tool_responses - ], + tool_responses=tool_responses, completed_at=now, started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now), ) diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index db33bca4ac..a46fa8eb70 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -27,6 +27,7 @@ from llama_stack.apis.inference import ( Inference, ToolConfig, + ToolResponse, ToolResponseMessage, UserMessage, ) @@ -168,7 +169,7 @@ async def resume_agent_turn( agent_id: str, session_id: str, turn_id: str, - tool_responses: List[ToolResponseMessage], + tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]], stream: Optional[bool] = False, ) -> AsyncGenerator: request = AgentTurnResumeRequest( diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index f221582c8f..277b37448a 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict from uuid import uuid4 import pytest @@ -40,6 +41,25 @@ def get_boiling_point(liquid_name: str, celcius: bool = True) -> int: return -1 +@client_tool +def get_boiling_point_with_metadata(liquid_name: str, celcius: bool = True) -> Dict[str, Any]: + """ + Returns the boiling point of a liquid in Celcius or Fahrenheit + + :param liquid_name: The name of the liquid + :param celcius: Whether to return the boiling point in Celcius + :return: The boiling point of the liquid in Celcius or Fahrenheit + """ + if liquid_name.lower() == "polyjuice": + if celcius: + temp = -100 + else: + temp = -212 + else: + temp = -1 + return {"content": temp, "metadata": {"source": "https://www.google.com"}} + + @pytest.fixture(scope="session") def agent_config(llama_stack_client_with_mocked_inference, text_model_id): available_shields = [shield.identifier for shield in llama_stack_client_with_mocked_inference.shields.list()] @@ -551,8 +571,9 @@ def test_rag_and_code_agent(llama_stack_client_with_mocked_inference, agent_conf assert expected_kw in response.output_message.content.lower() -def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_config): - client_tool = get_boiling_point +@pytest.mark.parametrize("client_tools", [(get_boiling_point, False), (get_boiling_point_with_metadata, True)]) +def test_create_turn_response(llama_stack_client_with_mocked_inference, agent_config, client_tools): + client_tool, expects_metadata = client_tools agent_config = { **agent_config, "input_shields": [], @@ -577,7 +598,9 @@ assert len(steps) == 3 assert steps[0].step_type == "inference" assert steps[1].step_type == "tool_execution" - assert steps[1].tool_calls[0].tool_name == "get_boiling_point" + assert steps[1].tool_calls[0].tool_name.startswith("get_boiling_point") + if expects_metadata: + assert steps[1].tool_responses[0].metadata["source"] == "https://www.google.com" assert steps[2].step_type == "inference" last_step_completed_at = None diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 4b0d9b1c10..9e70e3df01 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -102,7 +102,22 @@ { "event": { "delta": {
"text": " \"celcius\": \"false\"}}", + "text": "celcius\": \"false\"}}", "type": "text" }, "event_type": { @@ -366,7 +396,7 @@ "celcius": "false", "liquid_name": "polyjuice" }, - "call_id": "b9ded2e6-bef1-40bc-8a5b-a8c1018d0ba2", + "call_id": "00c0968b-d7d4-450d-a6ff-03d64ae9f772", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -590,45 +620,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "name\": \"get_boiling_point\",", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " \"parameters\": {\"liquid_name\": \"polyju", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -647,7 +639,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "ice\", \"celcius\":", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", "type": "tool_call" }, "event_type": { @@ -666,7 +658,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " \"true\"}}", + "tool_call": "\", \"celcius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -690,7 +682,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "98c011b5-f5de-416e-9a06-c2e3d0fa5581", + "call_id": "eda85f20-da80-4e11-a0e4-3849159ae70f", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -831,7 +823,7 @@ { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C", + "text": " boiling point of polyjuice is -100\u00b0C.", "type": "text" }, "event_type": { @@ -846,7 +838,60 @@ { "event": { "delta": { - "text": ".", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point_with_metadata', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point_with_metadata', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point_with_metadata', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': 
ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C.", "type": "text" }, "event_type": { @@ -1103,7 +1148,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celci", + "tool_call": "\": {\"liquid_name\": \"poly", "type": "tool_call" }, "event_type": { @@ -1122,7 +1167,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "us\": \"true\"}}", + "tool_call": "juice\", \"celcius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -1146,7 +1191,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "15326d2e-d284-4c7e-86b1-5bfbba74a914", + "call_id": "8b8b3ad5-5e47-4f56-a823-e2d82fa72d9c", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -1184,7 +1229,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Give me a sentence that contains the word: hello', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point_with_metadata', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { "chunks": [ { "event": { @@ -1204,8 +1249,12 @@ { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -1219,8 +1268,12 @@ { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store clerk.", - "type": 
"text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -1234,35 +1287,35 @@ { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "get_boiling_point_with_metadata\", \"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\ndf.head()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "parameters\": {\"liquid_name\": \"poly", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -1272,8 +1325,12 @@ { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "juice\", \"celcius\": \"true\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -1287,42 +1344,64 @@ { "event": { "delta": { - "text": " error message indicates that the `bwrap.core` module", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "3438f2d7-895f-4a94-8e1f-c2f01860ce88", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, { "event": { "delta": { - "text": " is not found. This is likely because the `bwrap` package is not installed", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Give me a sentence that contains the word: hello', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [])]": { + "chunks": [ { "event": { "delta": { - "text": ". 
To fix this, you can install the `bwrap", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -1332,7 +1411,7 @@ { "event": { "delta": { - "text": "` package using pip:\n\n```\npip install bwrap\n", + "text": "The", "type": "text" }, "event_type": { @@ -1347,7 +1426,7 @@ { "event": { "delta": { - "text": "```\n\nHowever, since the `bwrap", + "text": " customer smiled and said \"hello\" to the friendly store clerk", "type": "text" }, "event_type": { @@ -1362,7 +1441,7 @@ { "event": { "delta": { - "text": "` package is not a real package, you can ignore this error", + "text": ".", "type": "text" }, "event_type": { @@ -1377,27 +1456,35 @@ { "event": { "delta": { - "text": " and continue with the code.\n\nThe code above will print a summary of", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\ndf.head()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " the CSV file, including the number of non-null values in each column", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -1407,7 +1494,7 @@ { "event": { "delta": { - "text": ", the data types of each column, and a summary of the central", + "text": "The", "type": "text" }, "event_type": { @@ -1422,7 +1509,7 @@ { "event": { "delta": { - "text": " tendency and dispersion of each numeric column.", + "text": " error message indicates that the `bwrap.core` module", "type": "text" }, "event_type": { @@ -1437,21 +1524,156 @@ { "event": { "delta": { - "text": "", + "text": " is not found. This is likely because the `bwrap` package is not installed", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } + }, + { + "event": { + "delta": { + "text": ". To fix this, you can install the `bwrap", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "` package using pip:\n\n```\npip install bwrap\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "```\n\nHowever, since the `bwrap", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "` package is not a real package, you can ignore this error", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " and continue with the code.\n\nThe code above will print a summary of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the CSV file, including the number of non-null values in each column", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ", the data types of each column, and a summary of the central", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " tendency and dispersion of each numeric column.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": 
"progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } ], "type": "generator" }, @@ -1673,7 +1895,7 @@ { "event": { "delta": { - "text": " error message indicates that the `bwrap.core` module is", + "text": " error message indicates that the `b", "type": "text" }, "event_type": { @@ -1688,7 +1910,7 @@ { "event": { "delta": { - "text": " not found. This is likely because the", + "text": "wrap.core` module is not found", "type": "text" }, "event_type": { @@ -1703,7 +1925,7 @@ { "event": { "delta": { - "text": " `bwrap` package is not installed. To fix this,", + "text": ". This is likely because the `", "type": "text" }, "event_type": { @@ -1718,7 +1940,7 @@ { "event": { "delta": { - "text": " you can install the `bwrap` package", + "text": "bwrap` package is not installed", "type": "text" }, "event_type": { @@ -1733,7 +1955,7 @@ { "event": { "delta": { - "text": " using pip:\n\n```\npip install bwrap", + "text": ". To fix this, you can install the", "type": "text" }, "event_type": { @@ -1748,7 +1970,7 @@ { "event": { "delta": { - "text": "\n```\n\nHowever, if you don't", + "text": " `bwrap` package using pip:\n\n```\npip install", "type": "text" }, "event_type": { @@ -1763,7 +1985,7 @@ { "event": { "delta": { - "text": " have permission to install packages, you can use", + "text": " bwrap\n```\n\nHowever, if", "type": "text" }, "event_type": { @@ -1778,7 +2000,7 @@ { "event": { "delta": { - "text": " the `knowledge_search` function to get information about", + "text": " you don't have the `bwrap` package installed,", "type": "text" }, "event_type": { @@ -1793,7 +2015,7 @@ { "event": { "delta": { - "text": " the CSV file instead:\n\n```\n{\n ", + "text": " you can't use the `", "type": "text" }, "event_type": { @@ -1808,7 +2030,7 @@ { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"", + "text": "b", "type": "text" }, "event_type": { @@ -1823,7 +2045,7 @@ { "event": { "delta": { - "text": "knowledge_search\",\n \"parameters\": {\n", + "text": "wrap.core` module.", "type": "text" }, "event_type": { @@ -1838,7 +2060,7 @@ { "event": { "delta": { - "text": " \"query\": \"describe a csv file\"\n }\n", + "text": " In this case, you can", "type": "text" }, "event_type": { @@ -1853,7 +2075,7 @@ { "event": { "delta": { - "text": "}\n```\n\nThis will return a description of", + "text": " try to load the CSV file using the `p", "type": "text" }, "event_type": { @@ -1868,7 +2090,7 @@ { "event": { "delta": { - "text": " the CSV file.", + "text": "andas` library directly.\n\nHere is the corrected code:\n\n```", "type": "text" }, "event_type": { @@ -1883,35 +2105,27 @@ { "event": { "delta": { - "text": "", + "text": "python\nimport pandas as pd\ndf = pd.read_csv", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), 
ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": "(\"/var/folders/cz/vyh7y1d11x", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -1921,7 +2135,7 @@ { "event": { "delta": { - "text": "The", + "text": "g881lsxsshnc5c000", "type": "text" }, "event_type": { @@ -1936,7 +2150,7 @@ { "event": { "delta": { - "text": " error message indicates that the `bwrap.core` module", + "text": "0gn/T/tmp8d5c", "type": "text" }, "event_type": { @@ -1951,7 +2165,7 @@ { "event": { "delta": { - "text": " is not found. This is likely", + "text": "8spc/zOZSE5", "type": "text" }, "event_type": { @@ -1966,7 +2180,7 @@ { "event": { "delta": { - "text": " because the `bwrap` package is not installed. 
To fix this", + "text": "zcinflation.csv\")\nprint(df.head())\nprint(df.info())\n", "type": "text" }, "event_type": { @@ -1981,7 +2195,7 @@ { "event": { "delta": { - "text": ", you can install the `bwrap` package using pip:\n\n```\n", + "text": "print(df.describe())\n```\n\nThis code will", "type": "text" }, "event_type": { @@ -1996,7 +2210,7 @@ { "event": { "delta": { - "text": "pip install bwrap\n```\n\nHowever, if you don't have", + "text": " load the CSV file and print the first few rows, information about", "type": "text" }, "event_type": { @@ -2011,7 +2225,7 @@ { "event": { "delta": { - "text": " permission to install packages, you can use the `knowledge_search` function to", + "text": " the data, and summary statistics.", "type": "text" }, "event_type": { @@ -2026,27 +2240,35 @@ { "event": { "delta": { - "text": " get information about the CSV file instead:\n\n```\n{\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -2056,7 +2278,7 @@ { "event": { "delta": { - "text": "knowledge_search\",\n \"parameters\": {\n ", + "text": "The", "type": "text" }, "event_type": { @@ -2071,7 +2293,7 @@ { "event": { "delta": { - "text": " \"query\": \"describe a csv file\"\n }\n}\n``", + "text": " error message indicates that the `bwrap.core` module", "type": "text" }, "event_type": { @@ -2086,7 +2308,7 @@ { "event": { "delta": { - "text": "`\n\nThis will return a description of the CSV file.", + "text": " is not found. This is likely", "type": "text" }, "event_type": { @@ -2101,35 +2323,27 @@ { "event": { "delta": { - "text": "", + "text": " because the `bwrap` package is not installed. To fix this", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": ", you can install the `bwrap` package using pip:\n\n```\n", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -2139,12 +2353,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "pip install bwrap\n```\n\nHowever, if you don't have", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2158,12 +2368,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\ndf = pd.read", - "type": "tool_call" + "text": " permission to install packages, you can use the `knowledge_search` function to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2177,12 +2383,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/cz/vyh7y1d11", - "type": "tool_call" + "text": " get information about the CSV file instead:\n\n```\n{\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2196,12 +2398,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "xg881lsxsshnc5c0000gn/T/tmpc_", - "type": "tool_call" + "text": " \"type\": \"function\",\n \"name\": \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2215,12 +2413,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "ozqkdv/GwQ6oJB4inflation", - "type": "tool_call" + "text": "knowledge_search\",\n \"parameters\": {\n ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2234,12 +2428,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe", - "type": "tool_call" + "text": " \"query\": \"describe a csv file\"\n }\n}\n``", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2253,12 +2443,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "())", - "type": "tool_call" + "text": "`\n\nThis will return a description of the CSV file.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2269,37 +2455,6 @@ }, "metrics": null }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/GwQ6oJB4inflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())" - }, - "call_id": "551648f3-c903-44ef-84ae-0f1dcbaaa68f", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": 
"tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, { "event": { "delta": { @@ -2321,7 +2476,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -2364,26 +2519,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "import pandas as pd\ndf = pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/cz/v", + "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -2402,7 +2538,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "yh7y1d11xg881", + "tool_call": "z/vyh7y1d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -2421,7 +2557,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "lsxsshnc5c0000gn/T/tmpn9tl", + "tool_call": "5c0000gn/T/tmp8d5c8spc", "type": "tool_call" }, "event_type": { @@ -2440,7 +2576,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "gts1/qYsQ3ZJLinflation.csv", + "tool_call": "/zOZSE5zcinflation.csv\")\nprint(df.head())\nprint", "type": "tool_call" }, "event_type": { @@ -2459,7 +2595,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())", + "tool_call": "(df.info())\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -2480,9 +2616,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpn9tlgts1/qYsQ3ZJLinflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())" + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp8d5c8spc/zOZSE5zcinflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())" }, - "call_id": "6c3c4895-55a7-4083-b5d1-6ee42bcbe5fa", + "call_id": "09b4d9a1-8ee4-4de4-a5a3-91cad464e668", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -2523,7 +2659,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -2585,7 +2721,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh", + "tool_call": "_csv(\"/var/folders/cz/v", "type": "tool_call" }, "event_type": { @@ -2604,7 +2740,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "7y1d11xg881lsxsshnc5c", + "tool_call": "yh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -2623,7 +2759,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "0000gn/T/tmpc_ozqkdv/Gw", + "tool_call": "lsxsshnc5c0000gn/T/tmpn9tl", "type": "tool_call" }, "event_type": { @@ -2642,7 +2778,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "Q6oJB4inflation.csv\")\n", + "tool_call": "gts1/qYsQ3ZJLinflation.csv", "type": "tool_call" }, "event_type": { @@ -2661,7 +2797,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "print(df.head())", + "tool_call": "\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -2682,9 +2818,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/GwQ6oJB4inflation.csv\")\nprint(df.head())" + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpn9tlgts1/qYsQ3ZJLinflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())" }, - "call_id": "204b3ad9-ff20-4fab-a055-13da99874d88", + "call_id": "6c3c4895-55a7-4083-b5d1-6ee42bcbe5fa", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -2725,7 +2861,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -2745,31 +2881,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/", - "type": "tool_call" + "text": "I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2783,12 +2896,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "var/folders/cz/vyh7", - "type": "tool_call" + "text": "'m unable to access the file you provided. However, I can", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2802,12 +2911,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "y1d11xg881lsxsshnc5c0000", - "type": "tool_call" + "text": " suggest a general approach to describe a CSV file", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2821,12 +2926,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "gn/T/tmpn9tlgts1", - "type": "tool_call" + "text": ".\n\nYou can use the pandas", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2840,12 +2941,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/qYsQ3ZJLin", - "type": "tool_call" + "text": " library in Python to load and inspect the CSV", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2859,12 +2956,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation.csv\")\nprint(df.head())", - "type": "tool_call" + "text": " file. 
Here's a general outline of the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -2878,66 +2971,12 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpn9tlgts1/qYsQ3ZJLinflation.csv\")\nprint(df.head())" - }, - "call_id": "e6c48b40-6504-4043-b3fa-644bd7fafd0f", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are running this code in a notebook, you can use the `upload` button to upload the file. 
If you are running this code in a script, you need to provide the file path.\\n\\nHere is an example of how you can describe the csv file if you have it in the same directory as your script:\\n\\n```python\\nimport pandas as pd\\n\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nThis will print the first few rows of the data, information about the data, and summary statistics about the data.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", + "text": " steps you can follow:\n\n1. Import the pandas library:", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -2947,7 +2986,7 @@ { "event": { "delta": { - "text": "This", + "text": " `import pandas as pd`\n2. Load the CSV file", "type": "text" }, "event_type": { @@ -2962,7 +3001,7 @@ { "event": { "delta": { - "text": " code will create a time series plot of the average yearly inflation.", + "text": " into a dataframe: `df = pd.read_csv('file.csv", "type": "text" }, "event_type": { @@ -2977,7 +3016,7 @@ { "event": { "delta": { - "text": " The x-axis represents the year", + "text": "')`\n3. Print the first few rows", "type": "text" }, "event_type": { @@ -2992,7 +3031,7 @@ { "event": { "delta": { - "text": " and the y-axis represents the average inflation", + "text": " of the dataframe: `print(df.head())`\n4", "type": "text" }, "event_type": { @@ -3007,7 +3046,7 @@ { "event": { "delta": { - "text": ". The plot will show the trend of average yearly inflation over the", + "text": ". 
Print the data types of each column", "type": "text" }, "event_type": { @@ -3022,7 +3061,7 @@ { "event": { "delta": { - "text": " years.\n\nPlease note that you need to replace 'inflation.csv", + "text": ": `print(df.dtypes)`\n5", "type": "text" }, "event_type": { @@ -3037,7 +3076,7 @@ { "event": { "delta": { - "text": "' with the actual path to your csv file. Also, this", + "text": ". Print the summary statistics of the dataframe:", "type": "text" }, "event_type": { @@ -3052,7 +3091,7 @@ { "event": { "delta": { - "text": " code assumes that the csv file has a column named 'date'", + "text": " `print(df.describe())`\n\nThis will give you a", "type": "text" }, "event_type": { @@ -3067,7 +3106,7 @@ { "event": { "delta": { - "text": " and another column named 'inflation'. If", + "text": " general idea of the structure and content of the CSV file.", "type": "text" }, "event_type": { @@ -3082,7 +3121,7 @@ { "event": { "delta": { - "text": " your csv file has different column names, you", + "text": " If you need more specific information, you can use other pandas functions", "type": "text" }, "event_type": { @@ -3097,7 +3136,7 @@ { "event": { "delta": { - "text": " need to adjust the code accordingly.", + "text": " to inspect the dataframe.", "type": "text" }, "event_type": { @@ -3130,7 +3169,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are running this code in a notebook, you can use the `upload` button to upload the file. 
If you are running this code in a script, you need to provide the file path.\\n\\nHere is an example of how you can describe the csv file if you have it in the same directory as your script:\\n\\n```python\\nimport pandas as pd\\n\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nThis will print the first few rows of the data, information about the data, and summary statistics about the data.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -3173,45 +3212,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " data\ndf = pd.read_csv('inflation.csv')\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Convert date column to datetime\ndf['date'] = pd.to", + "tool_call": "import pandas as pd\nimport code_interpreter\n\n#", "type": "tool_call" }, "event_type": { @@ -3230,7 +3231,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "_datetime(df['date'])\n\n# Group by year and calculate average", + "tool_call": " Load the CSV file\ndf = pd.read_csv(\"/", "type": "tool_call" }, "event_type": { @@ -3249,7 +3250,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " inflation\naverage_inflation = df.groupby(df['date'].", + "tool_call": "var/folders/cz/vyh7y", "type": "tool_call" }, "event_type": { @@ -3268,7 +3269,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "dt.year)['inflation'].mean()\n\n# Plot", + "tool_call": "1d11xg881lsxsshnc5c000", "type": "tool_call" }, "event_type": { @@ -3287,7 +3288,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " time series\nplt.figure(figsize=(10,6))\nplt", + "tool_call": "0gn/T/tmpjxdo91ce/g1r3", "type": "tool_call" }, "event_type": { @@ -3306,7 +3307,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".plot(average_inflation.index, average_inflation.values, marker='", + "tool_call": "WGZRinflation.csv\")\n\n# Print the first few rows of", "type": "tool_call" }, "event_type": { @@ -3325,7 +3326,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", + "tool_call": " the dataframe\nprint(df.head())\n\n#", "type": "tool_call" }, "event_type": { @@ -3344,7 +3345,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "('Year')\nplt.ylabel('", + "tool_call": " Print the data types of each column", "type": "tool_call" }, "event_type": { @@ -3363,7 +3364,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "Average Inflation')\nplt.grid(True)\n", + "tool_call": "\nprint(df.dtypes)\n\n# Print the summary statistics", "type": "tool_call" }, "event_type": { @@ -3382,7 +3383,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "plt.show()", + "tool_call": " of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -3403,9 +3404,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as 
pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpjxdo91ce/g1r3WGZRinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "81d7a873-376b-438e-916d-d5454e6ed09e", + "call_id": "fbc1b233-207f-4f7b-8298-8d72a86d6f2c", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -3446,7 +3447,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will load the csv file and print the first few rows, a summary of the data, and some descriptive statistics. \\n\\nPlease replace \\'inflation.csv\\' with the actual path to your csv file. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. 
\\n\\nPlease provide the actual file path or the file itself, and I will be happy to help you describe it.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"inflation.csv\")\\n\\n# Convert date column to datetime\\ndf[\\'date\\'] = pd.to_datetime(df[\\'date\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df[\\'date\\'].dt.year)[\\'inflation\\'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Average Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -3466,8 +3467,12 @@ { "event": { "delta": { - "text": "It", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3481,8 +3486,12 @@ { "event": { "delta": { - "text": " seems that the file \"inflation.csv\" does not exist.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\ndf = pd.read_csv", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3496,8 +3505,12 @@ { "event": { "delta": { - "text": " \n\nTo plot the average yearly inflation as a time series, you", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "(\"/var/folders/cz/vyh7y1d11x", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3511,8 +3524,12 @@ { "event": { "delta": { - "text": " need to provide the actual file path or the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "g881lsxsshnc5c0000gn/T", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3526,8 +3543,12 @@ { "event": { "delta": { - "text": " file itself. If you are using a local file, you can", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "/tmp8d5c8spc/zOZSE5zcin", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3541,8 +3562,12 @@ { "event": { "delta": { - "text": " use the `load_data` function from the `code_interpreter", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\nprint(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3556,9 +3581,82 @@ { "event": { "delta": { - "text": "` library to load the file. 
\n\nHere is an example of how", + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp8d5c8spc/zOZSE5zcinflation.csv\")\nprint(df.head())" + }, + "call_id": "c19a0d1e-6b44-408f-9839-819436425778", + "tool_name": { + "__enum__": "BuiltinTool", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", "type": "text" }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" @@ -3571,8 +3669,12 @@ { "event": { "delta": { - "text": " you can do it:\n\n```\nimport pandas as pd\nfrom code_inter", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3586,8 +3688,12 @@ { "event": { "delta": { - "text": "preter import load_data\n\n# Load data\ndf", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "var/folders/cz/vyh7", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3601,8 +3707,12 @@ { "event": { "delta": { - "text": " = load_data('inflation.csv')\n\n#", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c0000", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3616,8 +3726,12 @@ { "event": { "delta": { - "text": " Convert date column to datetime\ndf['date'] = pd.to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "gn/T/tmpn9tlgts1", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3631,8 +3745,12 @@ { "event": { "delta": { - "text": "_datetime(df['date'])\n\n# Group by year", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "/qYsQ3ZJLin", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3646,8 +3764,12 @@ { "event": { "delta": { - "text": " and calculate average inflation\naverage_inflation = df.groupby(df['date", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\nprint(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -3661,14 +3783,68 @@ { "event": { "delta": { - "text": "'].dt.year)['inflation'].mean()\n\n# Plot time series\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpn9tlgts1/qYsQ3ZJLinflation.csv\")\nprint(df.head())" + }, + "call_id": "e6c48b40-6504-4043-b3fa-644bd7fafd0f", + "tool_name": { + "__enum__": "BuiltinTool", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the file or the code you used to create the file. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n```\\n\\nThis will give you an idea of what the csv file contains.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + 
"type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, "stop_reason": null }, "metrics": null @@ -3676,7 +3852,7 @@ { "event": { "delta": { - "text": "plt.figure(figsize=(10,6))\nplt.plot(average_inflation", + "text": "This", "type": "text" }, "event_type": { @@ -3691,7 +3867,7 @@ { "event": { "delta": { - "text": ".index, average_inflation.values, marker", + "text": " code will create a line plot of the", "type": "text" }, "event_type": { @@ -3706,7 +3882,7 @@ { "event": { "delta": { - "text": "='o')\nplt.title('Average Yearly Inflation')\nplt", + "text": " average yearly inflation over time. The x-axis", "type": "text" }, "event_type": { @@ -3721,7 +3897,7 @@ { "event": { "delta": { - "text": ".xlabel('Year')\nplt.ylabel('Average Inflation')\nplt", + "text": " represents the year and the y-axis represents", "type": "text" }, "event_type": { @@ -3736,7 +3912,7 @@ { "event": { "delta": { - "text": ".grid(True)\nplt.show()\n```\n\nThis", + "text": " the average inflation. The plot will also", "type": "text" }, "event_type": { @@ -3751,7 +3927,7 @@ { "event": { "delta": { - "text": " will load the csv file, convert the date column to datetime", + "text": " include a title, labels", "type": "text" }, "event_type": { @@ -3766,7 +3942,7 @@ { "event": { "delta": { - "text": ", group by year and calculate the average inflation, and then plot the time", + "text": " for the x and y axes, and a grid to make it", "type": "text" }, "event_type": { @@ -3781,7 +3957,7 @@ { "event": { "delta": { - "text": " series.\n\nPlease replace 'inflation.csv' with the actual path to your", + "text": " easier to read.\n\nPlease note that you need to replace '", "type": "text" }, "event_type": { @@ -3796,7 +3972,7 @@ { "event": { "delta": { - "text": " csv file. \n\nIf you are using a", + "text": "inflation.csv' with the actual path", "type": "text" }, "event_type": { @@ -3811,7 +3987,7 @@ { "event": { "delta": { - "text": " remote file, you need to provide the actual file path or the", + "text": " to your csv file. Also, this code assumes that the csv", "type": "text" }, "event_type": { @@ -3826,7 +4002,7 @@ { "event": { "delta": { - "text": " file itself. \n\nPlease provide the actual file path or the file itself,", + "text": " file has a column named 'date' and", "type": "text" }, "event_type": { @@ -3841,7 +4017,7 @@ { "event": { "delta": { - "text": " and I will be happy to", + "text": " another column named 'inflation'. 
If your csv file has", "type": "text" }, "event_type": { @@ -3856,7 +4032,7 @@ { "event": { "delta": { - "text": " help you plot the average yearly inflation as a time series.", + "text": " different column names, you need to adjust the code accordingly.", "type": "text" }, "event_type": { @@ -3889,7 +4065,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will load the csv file and print the first few rows, a summary of the data, and some descriptive statistics. \\n\\nPlease replace \\'inflation.csv\\' with the actual path to your csv file. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. 
\\n\\nPlease provide the actual file path or the file itself, and I will be happy to help you describe it.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the file or the code you used to create the file. 
\\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n```\\n\\nThis will give you an idea of what the csv file contains.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -3932,7 +4108,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", "type": "tool_call" }, "event_type": { @@ -3951,7 +4127,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"", + "tool_call": " data\ndf = pd.read_csv('inflation.csv')\n\n#", "type": "tool_call" }, "event_type": { @@ -3970,7 +4146,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "inflation.csv\")\n\n# Convert date column to", + "tool_call": " Convert 'date' column to datetime\ndf['", "type": "tool_call" }, "event_type": { @@ -3989,7 +4165,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " datetime\ndf['date'] = pd.to_datetime(df['", + "tool_call": "date'] = pd.to_datetime(df", "type": "tool_call" }, "event_type": { @@ -4008,7 +4184,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "date'])\n\n# Group by year and calculate average inflation\naverage_in", + "tool_call": "['date'])\n\n# Group by year and", "type": "tool_call" }, "event_type": { @@ -4027,7 +4203,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "flation = df.groupby(df['date'].dt.year)['inflation", + "tool_call": " calculate average inflation\naverage_inflation =", "type": "tool_call" }, "event_type": { @@ -4046,7 +4222,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "'].mean()\n\n# Plot time series\nplt.figure", + "tool_call": " df.groupby(df['date'].dt.year)['inflation'].mean", "type": "tool_call" }, "event_type": { @@ -4065,7 +4241,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "(figsize=(10,6))\nplt.plot", + "tool_call": "()\n\n# Plot the time series\nplt.figure(figsize=(10,", "type": "tool_call" }, "event_type": { @@ -4084,7 +4260,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "(average_inflation.index, average_inflation.values, marker='", + "tool_call": "6))\nplt.plot(average_inflation.index, average_inflation", "type": "tool_call" }, "event_type": { @@ -4103,7 +4279,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "o')\nplt.title('Average Yearly 
Inflation')\nplt.xlabel", + "tool_call": ".values, marker='o')\nplt.title('Average Yearly In", "type": "tool_call" }, "event_type": { @@ -4122,7 +4298,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True", + "tool_call": "flation')\nplt.xlabel('Year')\nplt.ylabel('Average In", "type": "tool_call" }, "event_type": { @@ -4141,7 +4317,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ")\nplt.show()", + "tool_call": "flation')\nplt.grid(True)\nplt.show()", "type": "tool_call" }, "event_type": { @@ -4162,9 +4338,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"inflation.csv\")\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" }, - "call_id": "da4cf054-6301-4408-85a8-35f15d1ff698", + "call_id": "6b6c11d8-75d5-4b34-b97b-ee523c7a8168", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -4205,7 +4381,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server, you can use the `requests` library to download the file and then load it into a pandas dataframe. 
\\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nimport requests\\n\\n# Download the csv file\\nurl = \"https://example.com/your_file.csv\"\\nresponse = requests.get(url)\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(response.content)\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace the `url` variable with the actual URL of your csv file. \\n\\nIf you are using a local file, you can simply use the `pd.read_csv()` function with the file path:\\n\\n```\\nimport pandas as pd\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(\\'your_file.csv\\')\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace `\\'your_file.csv\\'` with the actual path to your csv file.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"\")\\n\\n# Convert \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation[\\'Year\\'], average_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation Rate\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. 
\\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are running this code in a notebook, you can use the `upload` button to upload the file. If you are running this code in a script, you need to provide the file path.\\n\\nHere is an example of how you can describe the csv file if you have it in the same directory as your script:\\n\\n```python\\nimport pandas as pd\\n\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nThis will print the first few rows of the data, information about the data, and summary statistics about the data.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -4225,7 +4401,7 @@ { "event": { "delta": { - "text": "It", + "text": "This", "type": "text" }, "event_type": { @@ -4240,7 +4416,7 @@ { "event": { "delta": { - "text": " seems that the file \"/var/f", + "text": " code will create a time series plot of the average yearly inflation.", "type": "text" }, "event_type": { @@ -4255,7 +4431,7 @@ { "event": { "delta": { - "text": "olders/cz/vyh7y", + "text": " The x-axis represents the year", "type": "text" }, "event_type": { @@ -4270,7 +4446,7 @@ { "event": { "delta": { - "text": "1d11xg881lsx", + "text": " and the y-axis represents the average inflation", "type": "text" }, "event_type": { @@ -4285,7 +4461,7 @@ { "event": { "delta": { - "text": "sshnc5c0000gn", + "text": ". 
The plot will show the trend of average yearly inflation over the", "type": "text" }, "event_type": { @@ -4300,7 +4476,7 @@ { "event": { "delta": { - "text": "/T/tmpc_ozqkdv/EzGU", + "text": " years.\n\nPlease note that you need to replace 'inflation.csv", "type": "text" }, "event_type": { @@ -4315,7 +4491,7 @@ { "event": { "delta": { - "text": "QEnJinflation.csv\" does", + "text": "' with the actual path to your csv file. Also, this", "type": "text" }, "event_type": { @@ -4330,7 +4506,7 @@ { "event": { "delta": { - "text": " not exist. \n\nTo plot the average yearly inflation as a", + "text": " code assumes that the csv file has a column named 'date'", "type": "text" }, "event_type": { @@ -4345,7 +4521,7 @@ { "event": { "delta": { - "text": " time series, you need to provide the actual file path or", + "text": " and another column named 'inflation'. If", "type": "text" }, "event_type": { @@ -4360,7 +4536,7 @@ { "event": { "delta": { - "text": " the file itself. If you are using a remote server,", + "text": " your csv file has different column names, you", "type": "text" }, "event_type": { @@ -4375,7 +4551,7 @@ { "event": { "delta": { - "text": " you can use the `requests` library to download the file", + "text": " need to adjust the code accordingly.", "type": "text" }, "event_type": { @@ -4390,27 +4566,35 @@ { "event": { "delta": { - "text": " and then load it into a pandas dataframe. \n\nHere", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are running this code in a notebook, you can use the `upload` button to upload the file. 
If you are running this code in a script, you need to provide the file path.\\n\\nHere is an example of how you can describe the csv file if you have it in the same directory as your script:\\n\\n```python\\nimport pandas as pd\\n\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nThis will print the first few rows of the data, information about the data, and summary statistics about the data.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " is an example of how you can do it:\n\n```\nimport", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -4420,8 +4604,12 @@ { "event": { "delta": { - "text": " pandas as pd\nimport matplotlib.pyplot as plt\nimport requests\n\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4435,8 +4623,12 @@ { "event": { "delta": { - "text": "# Download the csv file\nurl = \"https://example.com", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4450,8 +4642,12 @@ { "event": { "delta": { - "text": "/your_file.csv\"\nresponse = requests.get(url)\n\n# Load", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " data\ndf = pd.read_csv('inflation.csv')\n\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4465,8 +4661,12 @@ { "event": { "delta": { - "text": " the csv file into a pandas dataframe\ndf = pd.read_csv", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " Convert date column to datetime\ndf['date'] = pd.to", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4480,8 +4680,12 @@ { "event": { "delta": { - "text": "(response.content)\n\n# Convert 'Year' column to datetime\ndf", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "_datetime(df['date'])\n\n# Group by year and calculate average", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4495,8 +4699,12 @@ { "event": { "delta": { - "text": "['Year'] = pd.to_datetime(df['Year'])\n\n# Group", - "type": "text" + 
"parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " inflation\naverage_inflation = df.groupby(df['date'].", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4510,8 +4718,12 @@ { "event": { "delta": { - "text": " by year and calculate average inflation\naverage_inflation = df.groupby", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "dt.year)['inflation'].mean()\n\n# Plot", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4525,8 +4737,12 @@ { "event": { "delta": { - "text": "('Year')['Inflation'].mean().reset_index()\n\n# Plot", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " time series\nplt.figure(figsize=(10,6))\nplt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4540,8 +4756,12 @@ { "event": { "delta": { - "text": " average yearly inflation as a time series\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": ".plot(average_inflation.index, average_inflation.values, marker='", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4555,8 +4775,12 @@ { "event": { "delta": { - "text": "plt.figure(figsize=(10,6))\nplt.plot(average_in", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4570,8 +4794,12 @@ { "event": { "delta": { - "text": "flation['Year'], average_inflation['Inflation'], marker='", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "('Year')\nplt.ylabel('", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4585,8 +4813,12 @@ { "event": { "delta": { - "text": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "Average Inflation')\nplt.grid(True)\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4600,8 +4832,12 @@ { "event": { "delta": { - "text": "('Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "plt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4615,42 +4851,66 @@ { "event": { "delta": { - "text": ")\nplt.show()\n```\n\nPlease replace the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly 
Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "81d7a873-376b-438e-916d-d5454e6ed09e", + "tool_name": { + "__enum__": "BuiltinTool", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, { "event": { "delta": { - "text": " `url` variable with the actual URL of", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will load the csv file and print the first few rows, a summary of the data, and some descriptive statistics. \\n\\nPlease replace \\'inflation.csv\\' with the actual path to your csv file. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. 
\\n\\nPlease provide the actual file path or the file itself, and I will be happy to help you describe it.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"inflation.csv\")\\n\\n# Convert date column to datetime\\ndf[\\'date\\'] = pd.to_datetime(df[\\'date\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df[\\'date\\'].dt.year)[\\'inflation\\'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Average Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " your csv file. \n\nIf you", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -4660,7 +4920,7 @@ { "event": { "delta": { - "text": " are using a local file, you can", + "text": "It", "type": "text" }, "event_type": { @@ -4675,7 +4935,7 @@ { "event": { "delta": { - "text": " simply use the `pd.read_csv()` function with the file", + "text": " seems that the file \"inflation.csv\" does not exist.", "type": "text" }, "event_type": { @@ -4690,7 +4950,7 @@ { "event": { "delta": { - "text": " path:\n\n```\nimport pandas as pd\nimport matplotlib.pyplot as", + "text": " \n\nTo plot the average yearly inflation as a time series, you", "type": "text" }, "event_type": { @@ -4705,7 +4965,7 @@ { "event": { "delta": { - "text": " plt\n\n# Load the csv file into a pandas dataframe\ndf", + "text": " need to provide the actual file path or the", "type": "text" }, "event_type": { @@ -4720,7 +4980,7 @@ { "event": { "delta": { - "text": " = pd.read_csv('your_file.csv')\n\n# Convert 'Year", + "text": " file itself. If you are using a local file, you can", "type": "text" }, "event_type": { @@ -4735,7 +4995,7 @@ { "event": { "delta": { - "text": "' column to datetime\ndf['Year'] = pd.to_datetime", + "text": " use the `load_data` function from the `code_interpreter", "type": "text" }, "event_type": { @@ -4750,7 +5010,7 @@ { "event": { "delta": { - "text": "(df['Year'])\n\n# Group by", + "text": "` library to load the file. 
\n\nHere is an example of how", "type": "text" }, "event_type": { @@ -4765,7 +5025,7 @@ { "event": { "delta": { - "text": " year and calculate average inflation\naverage_inflation = df.groupby('", + "text": " you can do it:\n\n```\nimport pandas as pd\nfrom code_inter", "type": "text" }, "event_type": { @@ -4780,7 +5040,7 @@ { "event": { "delta": { - "text": "Year')['Inflation'].mean().reset_index()\n\n# Plot average", + "text": "preter import load_data\n\n# Load data\ndf", "type": "text" }, "event_type": { @@ -4795,7 +5055,7 @@ { "event": { "delta": { - "text": " yearly inflation as a time series\nplt.figure", + "text": " = load_data('inflation.csv')\n\n#", "type": "text" }, "event_type": { @@ -4810,7 +5070,7 @@ { "event": { "delta": { - "text": "(figsize=(10,6))\nplt.plot(average_inflation", + "text": " Convert date column to datetime\ndf['date'] = pd.to", "type": "text" }, "event_type": { @@ -4825,7 +5085,7 @@ { "event": { "delta": { - "text": "['Year'], average_inflation['Inflation'], marker='o", + "text": "_datetime(df['date'])\n\n# Group by year", "type": "text" }, "event_type": { @@ -4840,7 +5100,7 @@ { "event": { "delta": { - "text": "')\nplt.title('Average Yearly Inflation')\nplt.xlabel('", + "text": " and calculate average inflation\naverage_inflation = df.groupby(df['date", "type": "text" }, "event_type": { @@ -4855,7 +5115,7 @@ { "event": { "delta": { - "text": "Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True)\n", + "text": "'].dt.year)['inflation'].mean()\n\n# Plot time series\n", "type": "text" }, "event_type": { @@ -4870,7 +5130,7 @@ { "event": { "delta": { - "text": "plt.show()\n```\n\nPlease replace `'", + "text": "plt.figure(figsize=(10,6))\nplt.plot(average_inflation", "type": "text" }, "event_type": { @@ -4885,7 +5145,7 @@ { "event": { "delta": { - "text": "your_file.csv'` with the actual", + "text": ".index, average_inflation.values, marker", "type": "text" }, "event_type": { @@ -4900,7 +5160,7 @@ { "event": { "delta": { - "text": " path to your csv file.", + "text": "='o')\nplt.title('Average Yearly Inflation')\nplt", "type": "text" }, "event_type": { @@ -4915,35 +5175,27 @@ { "event": { "delta": { - "text": "", + "text": ".xlabel('Year')\nplt.ylabel('Average Inflation')\nplt", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. 
\\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server, you can use the `requests` library to download the file and then load it into a pandas dataframe. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nimport requests\\n\\n# Download the csv file\\nurl = \"https://example.com/your_file.csv\"\\nresponse = requests.get(url)\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(response.content)\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace the `url` variable with the actual URL of your csv file. \\n\\nIf you are using a local file, you can simply use the `pd.read_csv()` function with the file path:\\n\\n```\\nimport pandas as pd\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(\\'your_file.csv\\')\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace `\\'your_file.csv\\'` with the actual path to your csv file.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": ".grid(True)\nplt.show()\n```\n\nThis", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -4953,12 +5205,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": " will load the csv file, convert the date column to datetime", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4972,12 +5220,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", - "type": "tool_call" + "text": ", group by year and calculate the average inflation, and then plot the time", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -4991,12 +5235,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " data\ndf = pd.read_csv(\"/var/folders/cz", - "type": "tool_call" + "text": " series.\n\nPlease replace 'inflation.csv' with the actual path to your", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5010,12 +5250,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/vyh7y1d11x", - "type": "tool_call" + "text": " csv file. 
\n\nIf you are using a", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5029,12 +5265,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "g881lsxsshnc5c0000gn/T/tmpc", - "type": "tool_call" + "text": " remote file, you need to provide the actual file path or the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5048,12 +5280,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_ozqkdv/EzGUQEnJinflation", - "type": "tool_call" + "text": " file itself. \n\nPlease provide the actual file path or the file itself,", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5067,12 +5295,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".csv\")\n\n# Convert 'Year' column", - "type": "tool_call" + "text": " and I will be happy to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5086,12 +5310,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " to datetime\ndf['Year'] = pd.to_datetime(df['", - "type": "tool_call" + "text": " help you plot the average yearly inflation as a time series.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5105,35 +5325,35 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Year'])\n\n# Group by year and calculate average inflation\naverage_in", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. 
\\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will load the csv file and print the first few rows, a summary of the data, and some descriptive statistics. \\n\\nPlease replace \\'inflation.csv\\' with the actual path to your csv file. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nPlease provide the actual file path or the file itself, and I will be happy to help you describe it.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -5145,9 +5365,9 @@ "delta": { "parse_status": { "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "value": "started" }, - "tool_call": "_index()\n\n# Plot average yearly inflation as a time series\nplt", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -5166,7 +5386,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".figure(figsize=(10,6))\nplt", + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", "type": "tool_call" }, "event_type": { @@ -5185,7 +5405,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".plot(average_inflation['Year'], average_inflation['In", + "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"", "type": "tool_call" }, "event_type": { @@ -5204,7 +5424,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "flation'], marker='o')\nplt.title('Average Yearly Inflation')\n", + "tool_call": "inflation.csv\")\n\n# Convert date column to", "type": "tool_call" }, "event_type": { @@ -5223,7 +5443,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "plt.xlabel('Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True", + "tool_call": " datetime\ndf['date'] = pd.to_datetime(df['", "type": "tool_call" }, "event_type": { @@ -5242,7 +5462,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ")\nplt.show()", + "tool_call": "date'])\n\n# Group by year and calculate average inflation\naverage_in", "type": "tool_call" }, "event_type": { @@ -5259,18 +5479,9 @@ "delta": { "parse_status": { "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = 
pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/EzGUQEnJinflation.csv\")\n\n# Convert 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation['Year'], average_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True)\nplt.show()" - }, - "call_id": "7e62f796-c5cd-4021-a651-b0048b75a083", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "value": "in_progress" }, + "tool_call": "flation = df.groupby(df['date'].dt.year)['inflation", "type": "tool_call" }, "event_type": { @@ -5278,45 +5489,23 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null }, { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "'].mean()\n\n# Plot time series\nplt.figure", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -5326,8 +5515,12 @@ { "event": { "delta": { - "text": "It", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + 
"tool_call": "(figsize=(10,6))\nplt.plot", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5341,8 +5534,12 @@ { "event": { "delta": { - "text": " seems that the file \"/var/f", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "(average_inflation.index, average_inflation.values, marker='", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5356,8 +5553,12 @@ { "event": { "delta": { - "text": "olders/cz/vyh7y1d11x", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5371,8 +5572,12 @@ { "event": { "delta": { - "text": "g881lsxsshnc5c000", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5386,8 +5591,12 @@ { "event": { "delta": { - "text": "0gn/T/tmpc", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": ")\nplt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5401,42 +5610,66 @@ { "event": { "delta": { - "text": "_ozqkdv/EzGUQEnJinflation", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"inflation.csv\")\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "da4cf054-6301-4408-85a8-35f15d1ff698", + "tool_name": { + "__enum__": "BuiltinTool", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, { "event": { "delta": { - "text": ".csv\" does not exist. 
\n\nTo", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server or a local machine, you can use the `pd.read_csv()` function to load the csv file. \\n\\nHere is an example:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': 
ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " describe the csv file, you need to provide the actual file", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -5446,7 +5679,7 @@ { "event": { "delta": { - "text": " path or the file itself. If you", + "text": "This", "type": "text" }, "event_type": { @@ -5461,7 +5694,7 @@ { "event": { "delta": { - "text": " are using a remote server, you can use the `requests` library", + "text": " code will create a line plot of", "type": "text" }, "event_type": { @@ -5476,7 +5709,7 @@ { "event": { "delta": { - "text": " to download the file and then load it into a pandas dataframe. \n\nHere", + "text": " the average yearly inflation over time. The x-axis", "type": "text" }, "event_type": { @@ -5491,7 +5724,7 @@ { "event": { "delta": { - "text": " is an example of how you can do it:\n\n```\nimport pandas as", + "text": " represents the year and the y-axis represents the average", "type": "text" }, "event_type": { @@ -5506,7 +5739,7 @@ { "event": { "delta": { - "text": " pd\nimport requests\n\n# Download the csv file\nurl = \"https", + "text": " inflation. The plot also includes a title, labels for the x", "type": "text" }, "event_type": { @@ -5521,7 +5754,7 @@ { "event": { "delta": { - "text": "://example.com/your_file.csv\"\nresponse = requests.get(url)\n\n#", + "text": " and y axes, and a grid for", "type": "text" }, "event_type": { @@ -5536,7 +5769,7 @@ { "event": { "delta": { - "text": " Load the csv file into a pandas dataframe\ndf", + "text": " better visibility.\n\nPlease note that you need", "type": "text" }, "event_type": { @@ -5551,7 +5784,7 @@ { "event": { "delta": { - "text": " = pd.read_csv(response.content)\n\n# Print", + "text": " to replace 'inflation.csv' with the actual path to your", "type": "text" }, "event_type": { @@ -5566,7 +5799,7 @@ { "event": { "delta": { - "text": " the description of the dataframe\nprint", + "text": " csv file. Also, this code assumes that the 'date", "type": "text" }, "event_type": { @@ -5581,7 +5814,7 @@ { "event": { "delta": { - "text": "(df.describe())\n```\n\nPlease replace the `url`", + "text": "' column in your csv file is in a format that can be", "type": "text" }, "event_type": { @@ -5596,7 +5829,7 @@ { "event": { "delta": { - "text": " variable with the actual URL of your csv file. \n\nIf", + "text": " parsed by pandas' `to_datetime` function. 
If your date", "type": "text" }, "event_type": { @@ -5611,7 +5844,7 @@ { "event": { "delta": { - "text": " you are using a", + "text": " column is in a different format, you may need to specify the", "type": "text" }, "event_type": { @@ -5626,7 +5859,7 @@ { "event": { "delta": { - "text": " local file, you can simply use the `pd.read_csv", + "text": " format when calling `to_datetime`.", "type": "text" }, "event_type": { @@ -5641,27 +5874,35 @@ { "event": { "delta": { - "text": "()` function with the file path:\n\n```\nimport pandas as", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server or a local machine, you can use the `pd.read_csv()` function to load the csv file. 
\\n\\nHere is an example:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " pd\n\n#", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -5671,8 +5912,12 @@ { "event": { "delta": { - "text": " Load the csv file into a pandas", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5686,8 +5931,12 @@ { "event": { "delta": { - "text": " dataframe\ndf = pd.read_csv('your", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5701,8 +5950,12 @@ { "event": { "delta": { - "text": "_file.csv')\n\n# Print the description of", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\ndf = pd.read_csv('inflation.csv')\n\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5716,8 +5969,12 @@ { "event": { "delta": { - "text": " the dataframe\nprint(df.describe())\n``", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " Convert 'date' column to datetime\ndf['date']", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5731,8 +5988,12 @@ { "event": { "delta": { - "text": "`\n\nPlease replace `'your_file.csv'` with the actual path", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " = pd.to_datetime(df['date'])\n\n# Group by", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5746,8 +6007,12 @@ { "event": { "delta": { - "text": " to your csv file.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " year and calculate average inflation\naverage_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -5761,35 +6026,16 @@ { "event": { "delta": { - "text": "", - 
"type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby(df['date'].dt.year", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -5801,9 +6047,9 @@ "delta": { "parse_status": { "__enum__": "ToolCallParseStatus", - "value": "started" + "value": "in_progress" }, - "tool_call": "", + "tool_call": ")['inflation'].mean()\n\n# Plot the time series", "type": "tool_call" }, "event_type": { @@ -5822,7 +6068,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd", + "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(average_in", "type": "tool_call" }, "event_type": { @@ -5841,7 +6087,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": ".read_csv(\"/var", + "tool_call": "flation.index, average_inflation.values, marker", "type": "tool_call" }, "event_type": { @@ -5860,7 +6106,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "/folders/cz/vyh7y1d11xg881", + "tool_call": "='o')\nplt.title('Average Yearly Inflation')\n", "type": "tool_call" }, "event_type": { @@ -5879,7 +6125,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "lsxsshnc5c0000gn/T/tmpc_oz", + "tool_call": "plt.xlabel('Year')\nplt.ylabel('Average", "type": "tool_call" }, "event_type": { @@ -5898,7 +6144,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "qkdv/EzGUQEnJinflation.csv\")\n", + "tool_call": " Inflation')\nplt.grid(True)\nplt.show()", "type": "tool_call" }, "event_type": { @@ -5915,127 +6161,13 @@ "delta": { "parse_status": { "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "# Rows\nprint(\"Number of rows and columns in the data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ":\", df.shape)\n# 
Columns\nprint(\"Columns of the data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " are:\", len(df.columns))\n# Column names\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "print(\"Columns of the data are:\", df.columns)\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "# Column dtypes\nprint(\"Datatype of", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " the columns are:\", df.dtypes)", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "value": "succeeded" }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/EzGUQEnJinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" }, - "call_id": "e57ec9d1-68d8-4493-b3d3-0fb683a4663a", + "call_id": "65691869-f741-420c-bb73-23a1f8c0d82a", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -6076,7 +6208,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:71183\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:84988\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can ask your question now. I will help you answer it using the knowledge_search tool results.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:98cad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:84988\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:98cad\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:9c730\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. 
note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -6096,22 +6228,7 @@ { "event": { "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA, you can follow these steps", + "text": "It", "type": "text" }, "event_type": { @@ -6126,7 +6243,7 @@ { "event": { "delta": { - "text": ":\n\n1. 
Install the necessary packages", + "text": " seems that the file \"/var/f", "type": "text" }, "event_type": { @@ -6141,7 +6258,7 @@ { "event": { "delta": { - "text": ", including torchtune and the Llama2 model.\n", + "text": "olders/cz/vyh7y1d11", "type": "text" }, "event_type": { @@ -6156,7 +6273,7 @@ { "event": { "delta": { - "text": "2. Load the Llama2 model and specify which", + "text": "xg881lsxsshnc5c0000gn/T/tmp8", "type": "text" }, "event_type": { @@ -6171,7 +6288,7 @@ { "event": { "delta": { - "text": " layers to apply LoRA to.\n3.", + "text": "d5c8spc/Q8Y9qzV", "type": "text" }, "event_type": { @@ -6186,7 +6303,7 @@ { "event": { "delta": { - "text": " Define the LoRA parameters, such as the rank and", + "text": "Xinflation.csv\" does not exist", "type": "text" }, "event_type": { @@ -6201,7 +6318,7 @@ { "event": { "delta": { - "text": " alpha values.\n4. Train the model using", + "text": ". \n\nTo describe the csv file, you need to provide", "type": "text" }, "event_type": { @@ -6216,7 +6333,7 @@ { "event": { "delta": { - "text": " the LoRA fine-tuning recipe in torchtune", + "text": " the actual file path or the file itself", "type": "text" }, "event_type": { @@ -6231,7 +6348,7 @@ { "event": { "delta": { - "text": ".\n5. Use the trained model for inference or further fine", + "text": ". If you are using a remote server or a local machine,", "type": "text" }, "event_type": { @@ -6246,7 +6363,7 @@ { "event": { "delta": { - "text": "-tuning.\n\nHere is an example of how to apply Lo", + "text": " you can use the `pd.read_csv()` function to load the", "type": "text" }, "event_type": { @@ -6261,7 +6378,7 @@ { "event": { "delta": { - "text": "RA to Llama2-7B:\n\n", + "text": " csv file. \n\nHere is an example:\n\n```python\nimport", "type": "text" }, "event_type": { @@ -6276,7 +6393,7 @@ { "event": { "delta": { - "text": "```python\nfrom torchtune.models.llama2 import", + "text": " pandas as pd\n# Load data\ndf", "type": "text" }, "event_type": { @@ -6291,7 +6408,7 @@ { "event": { "delta": { - "text": " llama2_7b, lora_llama2", + "text": " = pd.read_csv('inflation.csv", "type": "text" }, "event_type": { @@ -6306,7 +6423,7 @@ { "event": { "delta": { - "text": "_7b\n\n# Build Llama2 without any Lo", + "text": "')\n# Print the first 5 rows of the dataframe\nprint", "type": "text" }, "event_type": { @@ -6321,7 +6438,7 @@ { "event": { "delta": { - "text": "RA layers\nbase_model = llama2_7b()\n\n", + "text": "(df.head())\n# Print the summary of the dataframe\nprint(df", "type": "text" }, "event_type": { @@ -6336,7 +6453,7 @@ { "event": { "delta": { - "text": "# The default settings for lora_llama", + "text": ".info())\nprint(df.describe())\n```\n\nThis will print the first", "type": "text" }, "event_type": { @@ -6351,7 +6468,7 @@ { "event": { "delta": { - "text": "2_7b will match those for", + "text": " 5 rows of the dataframe,", "type": "text" }, "event_type": { @@ -6366,7 +6483,7 @@ { "event": { "delta": { - "text": " llama2_7b\n# We just need to define", + "text": " the summary of the dataframe (including the", "type": "text" }, "event_type": { @@ -6381,7 +6498,7 @@ { "event": { "delta": { - "text": " which layers we want LoRA applied to.\n# Within each", + "text": " index dtype and column count), and the description of the dataframe", "type": "text" }, "event_type": { @@ -6396,7 +6513,7 @@ { "event": { "delta": { - "text": " self-attention, we can choose from [\"q_proj\",", + "text": " (including count, mean, std,", "type": "text" }, "event_type": { @@ -6411,7 +6528,7 @@ { "event": { 
"delta": { - "text": " \"k_proj\", \"v_proj\", and \"output_proj\"]", + "text": " min, 25%, 50%, 75%, max", "type": "text" }, "event_type": { @@ -6426,7 +6543,7 @@ { "event": { "delta": { - "text": ".\n# We can also set apply_lora_to_mlp=True", + "text": " for each column).", "type": "text" }, "event_type": { @@ -6441,27 +6558,35 @@ { "event": { "delta": { - "text": " or apply_lora_to_output=True to apply LoRA to other", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": " linear\n# layers outside of the self-", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -6471,8 +6596,12 @@ { "event": { "delta": { - "text": "attention.\nlora_model = lora_llama2_7", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6486,8 +6615,12 @@ { "event": { "delta": { - "text": "b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6501,8 +6634,12 @@ { "event": { "delta": { - "text": "```\n\nYou can also customize the LoRA parameters", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\ndf =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6516,8 +6653,12 @@ { "event": { "delta": { - "text": " by specifying the rank and alpha values:\n\n```python", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " pd.read_csv(\"/var/folders/cz/vyh7", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6531,8 +6672,12 @@ { "event": { "delta": { - "text": "\nlora_model = lora_llama2_7b", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsx", + "type": "tool_call" }, "event_type": { 
"__enum__": "ChatCompletionResponseEventType", @@ -6546,8 +6691,12 @@ { "event": { "delta": { - "text": "(lora_attn_modules=[\"q_proj\", \"v_proj\"],", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6561,8 +6710,12 @@ { "event": { "delta": { - "text": " lora_rank=8, lora_alpha=16)\n``", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "/tmp8d5c8spc", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6576,8 +6729,12 @@ { "event": { "delta": { - "text": "`\n\nTo train the model using the LoRA", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "/Q8Y9qzVXinflation.csv\")\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6591,8 +6748,12 @@ { "event": { "delta": { - "text": " fine-tuning recipe in torchtune, you can use", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "# Rows\nprint(\"Number of rows and columns in the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6606,8 +6767,12 @@ { "event": { "delta": { - "text": " the following command:\n\n```bash\ntune run lora_f", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6621,8 +6786,12 @@ { "event": { "delta": { - "text": "inetune_single_device --config llama3/8B_l", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " are:\", len(df.columns))\n# Column names\nprint", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6636,8 +6805,12 @@ { "event": { "delta": { - "text": "ora_single_device\n```\n\nThis will", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "(\"Columns of the data are:\", df.columns)\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6651,8 +6824,12 @@ { "event": { "delta": { - "text": " load the Llama3-8B-Instruct checkpoint and", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "# Column dtypes\nprint(\"Datatype of the columns are", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6666,8 +6843,12 @@ { "event": { "delta": { - "text": " tokenizer from the specified directory, then save a final checkpoint in the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": ":\", df.dtypes)", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -6681,15 +6862,31 @@ { "event": { "delta": { - "text": " same directory following the original format.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = 
pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp8d5c8spc/Q8Y9qzVXinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + }, + "call_id": "15893b4c-5a55-4ea7-9902-8a2f28fa3659", + "tool_name": { + "__enum__": "BuiltinTool", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, @@ -6714,7 +6911,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:71183\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:84988\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can ask your question now. I will help you answer it using the knowledge_search tool results.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:255c3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:3b16c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:14b97\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:14b97\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:14b97\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -6734,7 +6931,7 @@ { "event": { "delta": { - "text": "{\"", + "text": "To", "type": "text" }, "event_type": { @@ -6749,7 +6946,7 @@ { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search", + "text": " use LoRA in Torchtune, you can follow these", "type": "text" }, "event_type": { @@ -6764,7 +6961,7 @@ { "event": { "delta": { - "text": "\", \"parameters\": {\"query\": \"How to use Lo", + "text": " steps:\n\n1. Install Torchtune and its dependencies", "type": "text" }, "event_type": { @@ -6779,7 +6976,7 @@ { "event": { "delta": { - "text": "RA\"}}", + "text": ".\n2. Download the Llama", "type": "text" }, "event_type": { @@ -6794,63 +6991,42 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA" - }, - "call_id": "ee82ce77-7143-4b2f-8eb8-de5f31517b84", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "2 weights and tokenizer.\n3. 
Use the `l", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null }, { "event": { "delta": { - "text": "", + "text": "ora_llama2_7b` model in Torchtune", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:71183\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:84988\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": ", which applies LoRA to the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -6860,7 +7036,7 @@ { "event": { "delta": { - "text": "You", + "text": " Q and V projections by default.\n4.", "type": "text" }, "event_type": { @@ -6875,7 +7051,7 @@ { "event": { "delta": { - "text": " can ask your question now. 
I will help you answer it using", + "text": " Set the `lora_attn_modules` argument to", "type": "text" }, "event_type": { @@ -6890,7 +7066,7 @@ { "event": { "delta": { - "text": " the knowledge_search tool results.", + "text": " apply LoRA to all linear", "type": "text" }, "event_type": { @@ -6905,35 +7081,27 @@ { "event": { "delta": { - "text": "", + "text": " layers in the self-attention.\n", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:7bdfa\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
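The replaced chunks above all follow the event shape recorded throughout this fixture: a ``start`` event, a run of ``progress`` events whose ``delta.text`` fragments concatenate into the final answer, and a ``complete`` event carrying the stop reason. A minimal sketch of folding such recorded chunk dicts back into plain text; the field names (including the ``__enum__`` serialization) mirror this fixture's JSON, not a public client API:

.. code-block:: python

   from typing import Iterable

   def accumulate_text(chunks: Iterable[dict]) -> str:
       """Concatenate the text deltas from recorded stream chunks."""
       parts = []
       for chunk in chunks:
           event = chunk["event"]
           delta = event["delta"]
           # Enums are serialized as {"__enum__": ..., "value": ...} in these
           # recordings; text only arrives on "progress" events, while
           # "start" and "complete" carry empty deltas.
           if event["event_type"]["value"] == "progress" and delta.get("type") == "text":
               parts.append(delta["text"])
       return "".join(parts)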
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0c95c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:64211\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:0c95c\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. 
Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:64211\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:1d70c\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": "5. 
Increase the rank and", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -6943,7 +7111,7 @@ { "event": { "delta": { - "text": "To", + "text": " alpha values to experiment with different LoRA", "type": "text" }, "event_type": { @@ -6958,7 +7126,7 @@ { "event": { "delta": { - "text": " use LoRA, you can follow these steps:\n\n", + "text": " configurations.\n6. Run the LoRA finetuning", "type": "text" }, "event_type": { @@ -6973,7 +7141,7 @@ { "event": { "delta": { - "text": "1. Install the necessary packages", + "text": " recipe in Torchtune using the `lora_finet", "type": "text" }, "event_type": { @@ -6988,7 +7156,7 @@ { "event": { "delta": { - "text": ", including torchtune and the Llama", + "text": "une_distributed` command.\n7.", "type": "text" }, "event_type": { @@ -7003,7 +7171,7 @@ { "event": { "delta": { - "text": "2 model.\n2. Load the", + "text": " Monitor the loss curves and adjust the Lo", "type": "text" }, "event_type": { @@ -7018,7 +7186,7 @@ { "event": { "delta": { - "text": " Llama2 model and specify which layers", + "text": "RA configuration as needed to trade off memory and model performance.\n\n", "type": "text" }, "event_type": { @@ -7033,7 +7201,7 @@ { "event": { "delta": { - "text": " to apply LoRA to.\n3. Define the", + "text": "By following these steps, you can effectively use LoRA in", "type": "text" }, "event_type": { @@ -7048,7 +7216,7 @@ { "event": { "delta": { - "text": " LoRA parameters, such as the rank", + "text": " Torchtune to fine-tune Llama", "type": "text" }, "event_type": { @@ -7063,7 +7231,7 @@ { "event": { "delta": { - "text": " and alpha values.\n4. Train the model using", + "text": "2 models with a low memory footprint.", "type": "text" }, "event_type": { @@ -7078,27 +7246,35 @@ { "event": { "delta": { - "text": " the LoRA fine-tuning recipe in torchtune.\n\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:255c3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:3b16c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
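The dataset note quoted earlier in this recording states that torchtune's dataset builders forward any extra keyword argument to ``load_dataset``, naming ``split`` and ``name`` as examples. A hedged sketch of that forwarding; ``my_org/my_chat_data`` and ``default`` are hypothetical placeholders, and the builder call otherwise matches the quoted snippet:

.. code-block:: python

   from torchtune.datasets import chat_dataset
   from torchtune.models.llama3 import llama3_tokenizer

   tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model")
   ds = chat_dataset(
       tokenizer=tokenizer,
       source="my_org/my_chat_data",  # hypothetical Hub dataset id
       name="default",                # hypothetical config; forwarded to load_dataset
       split="train",
       conversation_column="dialogue",
       conversation_style="sharegpt",
   )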
What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": "Here is an example of how to use Lo", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -7108,7 +7284,7 @@ { "event": { "delta": { - "text": "RA with the Llama2 model:\n\n```python\nfrom", + "text": "{\"", "type": "text" }, "event_type": { @@ -7123,7 +7299,7 @@ { "event": { "delta": { - "text": " torchtune.models.llama2 import", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -7138,7 +7314,7 @@ { "event": { "delta": { - "text": " llama2_7b, lora_llama2_7", + "text": "parameters\": {\"query\": \"How to use LoRA in", "type": "text" }, "event_type": { @@ -7153,7 +7329,7 @@ { "event": { "delta": { - "text": "b\n\n# Build Llama2 without any LoRA layers\n", + "text": " Torchtune\"}}", "type": "text" }, "event_type": { @@ -7168,27 +7344,63 @@ { "event": { "delta": { - "text": "base_model = llama2_7b()\n\n# The default settings", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "41f1d05b-cfca-4d54-a0de-38a968017c8b", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, { "event": { "delta": { - "text": " for lora_llama2_7b will match those", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:255c3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
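In the hunk above, the recorded completion changes from a plain-text reply into a streamed ``knowledge_search`` call whose ``ToolCallParseStatus`` lands as ``succeeded``. Extending the earlier sketch to pick up such tool-call deltas (again plain dicts mirroring this fixture's JSON, not a public API):

.. code-block:: python

   def extract_tool_calls(chunks):
       """Collect (tool_name, arguments) pairs from parsed tool-call deltas."""
       calls = []
       for chunk in chunks:
           delta = chunk["event"]["delta"]
           # Only "tool_call" deltas whose parse_status reached "succeeded"
           # carry a usable call, per the recording above.
           if (
               delta.get("type") == "tool_call"
               and delta["parse_status"]["value"] == "succeeded"
           ):
               tc = delta["tool_call"]
               calls.append((tc["tool_name"], tc["arguments"]))
       return calls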
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:3b16c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -7198,7 +7410,7 @@ { "event": { "delta": { - "text": " for llama2_7b\n#", + "text": "I", "type": "text" }, "event_type": { @@ -7213,7 +7425,7 @@ { "event": { "delta": { - "text": " We just need to define which layers we", + "text": "'m ready to help you answer questions about Torchtune based", "type": "text" }, "event_type": { @@ -7228,7 +7440,7 @@ { "event": { "delta": { - "text": " want LoRA applied to.\n# Within each self-attention", + "text": " on the documentation you provided. What's your first question?", "type": "text" }, "event_type": { @@ -7243,12 +7455,35 @@ { "event": { "delta": { - "text": ", we can choose from [\"q_proj\", \"k_proj", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:292ee\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:2513e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
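The ``Result 5`` chunk quoted above pairs ``use_dora: True`` with ``quantize_base: True`` in YAML. A hedged Python equivalent built on the torchtune builder named in that chunk; the argument names come straight from the quoted config, and exact signatures may vary across torchtune releases:

.. code-block:: python

   from torchtune.models.llama3 import lora_llama3_8b

   # Python equivalent of the quoted YAML overrides.
   model = lora_llama3_8b(
       lora_attn_modules=["q_proj", "k_proj", "v_proj"],
       apply_lora_to_mlp=True,
       lora_rank=16,
       lora_alpha=32,
       use_dora=True,       # swaps LoRALinear for DoRALinear under the hood
       quantize_base=True,  # QLoRA-style quantization of the frozen base weights
   )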
What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:47152\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:47152\\nContent: 06% of all params are trainable.\\n\\n.. 
note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:47152\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. 
note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -7258,7 +7493,7 @@ { "event": { "delta": { - "text": "\", \"v_proj\", and \"output_proj\"].\n#", + "text": "To", "type": "text" }, "event_type": { @@ -7273,7 +7508,7 @@ { "event": { "delta": { - "text": " We can also set apply_lora_to_mlp=True or", + "text": " use LoRA in Torchtune, you can follow these steps", "type": "text" }, "event_type": { @@ -7288,7 +7523,7 @@ { "event": { "delta": { - "text": " apply_lora_to_output=True to apply LoRA to other", + "text": ":\n\n1. Install Torchtune and its dependencies.\n", "type": "text" }, "event_type": { @@ -7303,7 +7538,7 @@ { "event": { "delta": { - "text": " linear\n# layers outside of the self-attention.\nl", + "text": "2. Download the Llama2 weights and tokenizer.\n", "type": "text" }, "event_type": { @@ -7318,7 +7553,7 @@ { "event": { "delta": { - "text": "ora_model = lora_llama", + "text": "3. Use the `lora_llama2_", "type": "text" }, "event_type": { @@ -7333,7 +7568,7 @@ { "event": { "delta": { - "text": "2_7b(lora_attn", + "text": "7b` model in Torchtune", "type": "text" }, "event_type": { @@ -7348,7 +7583,7 @@ { "event": { "delta": { - "text": "_modules=[\"q_proj\", \"v_proj\"])\n\n# Print the", + "text": ", which applies LoRA to the Q", "type": "text" }, "event_type": { @@ -7363,7 +7598,7 @@ { "event": { "delta": { - "text": " first layer's self-attention in the usual Llama2", + "text": " and V projections by default.\n4", "type": "text" }, "event_type": { @@ -7378,7 +7613,7 @@ { "event": { "delta": { - "text": " model\nprint(base_model.layers[0", + "text": ". Load the base model weights into the LoRA model without any", "type": "text" }, "event_type": { @@ -7393,7 +7628,7 @@ { "event": { "delta": { - "text": "].attn)\n# Print the same for Llama2 with", + "text": " conversion necessary.\n5. Set only LoRA parameters to", "type": "text" }, "event_type": { @@ -7408,7 +7643,7 @@ { "event": { "delta": { - "text": " LoRA weights\nprint(lora_model.layers[0].", + "text": " trainable.\n6. Run the LoRA finetuning recipe", "type": "text" }, "event_type": { @@ -7423,7 +7658,7 @@ { "event": { "delta": { - "text": "attn)\n```\n\nThis code will load the Llama", + "text": " in Torchtune with the desired configuration.\n\nYou", "type": "text" }, "event_type": { @@ -7438,7 +7673,7 @@ { "event": { "delta": { - "text": "2 model and apply LoRA to the", + "text": " can also experiment with different LoRA configurations, such as", "type": "text" }, "event_type": { @@ -7453,7 +7688,7 @@ { "event": { "delta": { - "text": " specified layers. 
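The results quoted above walk through one sequence: build the LoRA variant of the model, load the pretrained weights with ``strict=False``, then mark only the adapter parameters as trainable. Condensed into a single runnable sketch, with imports and calls exactly as they appear in the quoted snippets:

.. code-block:: python

   from torchtune.models.llama2 import llama2_7b, lora_llama2_7b
   from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params

   base_model = llama2_7b()
   lora_model = lora_llama2_7b(lora_attn_modules=["q_proj", "v_proj"])

   # strict=False: the LoRA model has adapter keys the base checkpoint
   # lacks; torchtune's recipes validate the missing/unexpected keys.
   lora_model.load_state_dict(base_model.state_dict(), strict=False)

   # Freeze everything except the LoRA adapter parameters.
   set_trainable_params(lora_model, get_adapter_params(lora_model))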
You can then train the model using the Lo", + "text": " applying LoRA to all linear layers in the self", "type": "text" }, "event_type": { @@ -7468,7 +7703,7 @@ { "event": { "delta": { - "text": "RA fine-tuning recipe in torchtune", + "text": "-attention, increasing the rank, or scaling alpha and rank", "type": "text" }, "event_type": { @@ -7483,7 +7718,7 @@ { "event": { "delta": { - "text": ".\n\nNote that you will need to modify the code to suit", + "text": " together.\n\nBy following these steps, you", "type": "text" }, "event_type": { @@ -7498,7 +7733,7 @@ { "event": { "delta": { - "text": " your specific use case and requirements. Additionally,", + "text": " can use LoRA in Torchtune to fine-tune a", "type": "text" }, "event_type": { @@ -7513,7 +7748,7 @@ { "event": { "delta": { - "text": " you may need to adjust the LoRA parameters and the training", + "text": " Llama2 model with a low memory footprint and achieve good", "type": "text" }, "event_type": { @@ -7528,7 +7763,7 @@ { "event": { "delta": { - "text": " settings to achieve the desired results.", + "text": " performance.", "type": "text" }, "event_type": { @@ -7561,7 +7796,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:7bdfa\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0c95c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:292ee\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:2513e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -7596,7 +7831,22 @@ { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How to", "type": "text" }, "event_type": { @@ -7611,7 +7861,7 @@ { "event": { "delta": { - "text": "parameters\": {\"query\": \"How to use LoRA\"}}", + "text": " use LoRA in Torchtune\"}}", "type": "text" }, "event_type": { @@ -7632,9 +7882,9 @@ }, "tool_call": { "arguments": { - "query": "How to use LoRA" + "query": "How to use LoRA in Torchtune" }, - "call_id": "ce86a63d-964a-49a0-8488-29c28ecb2f80", + "call_id": "5beb7c24-953b-4ad7-b834-a26522fb5ac7", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -7672,7 +7922,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:7bdfa\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0c95c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:292ee\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:2513e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -7692,52 +7942,7 @@ { "event": { "delta": { - "text": "You", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can use the following function call to answer", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the user's question:\n\n{\"type\": \"function\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "name\": \"knowledge_search\", \"parameters\": {\"query\":", + "text": "I", "type": "text" }, "event_type": { @@ -7752,7 +7957,7 @@ { "event": { "delta": { - "text": " \"How to fine-tune a L", + "text": "'m ready to help you answer questions about Torchtune based", "type": "text" }, "event_type": { @@ -7767,7 +7972,7 @@ { "event": { "delta": { - "text": "lama2 model with LoRA in torch", + "text": " on the documentation you provided. What's your first question", "type": "text" }, "event_type": { @@ -7782,7 +7987,7 @@ { "event": { "delta": { - "text": "tune\"}}", + "text": "?", "type": "text" }, "event_type": { @@ -7815,7 +8020,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:606ad\\nContent: .. 
_lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:e37c3\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:606ad\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0b7ba\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:ab1b9\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8bcf6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:cc646\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:cc646\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:cc646\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -7850,7 +8055,7 @@ { "event": { "delta": { - "text": " use LoRA, you can follow these steps", + "text": " use LoRA in Torchtune, you", "type": "text" }, "event_type": { @@ -7865,7 +8070,7 @@ { "event": { "delta": { - "text": ":\n\n1. Install the necessary packages, including torchtune", + "text": " can follow these steps:\n\n1. Install Torchtune", "type": "text" }, "event_type": { @@ -7880,7 +8085,7 @@ { "event": { "delta": { - "text": " and the Llama2 model.\n2.", + "text": " and its dependencies.\n2. Download the", "type": "text" }, "event_type": { @@ -7895,7 +8100,7 @@ { "event": { "delta": { - "text": " Load the Llama2 model and specify which layers to", + "text": " Llama2 weights and tokenizer.\n3", "type": "text" }, "event_type": { @@ -7910,7 +8115,7 @@ { "event": { "delta": { - "text": " apply LoRA to.\n3. Define the LoRA parameters", + "text": ". Use the `lora_llama", "type": "text" }, "event_type": { @@ -7925,7 +8130,7 @@ { "event": { "delta": { - "text": ", such as the rank and alpha values.\n", + "text": "2_7b` model in Torchtune, which", "type": "text" }, "event_type": { @@ -7940,7 +8145,7 @@ { "event": { "delta": { - "text": "4. Train the model using the LoRA fine-tuning", + "text": " applies LoRA to the Q and V", "type": "text" }, "event_type": { @@ -7955,7 +8160,7 @@ { "event": { "delta": { - "text": " recipe in torchtune.\n5. Use the", + "text": " projections by default.\n4. 
Load the base model weights into", "type": "text" }, "event_type": { @@ -7970,7 +8175,7 @@ { "event": { "delta": { - "text": " trained model for inference or further fine-tuning.\n\nHere is an", + "text": " the LoRA model without any conversion necessary.\n5. Set", "type": "text" }, "event_type": { @@ -7985,7 +8190,7 @@ { "event": { "delta": { - "text": " example of how to use LoRA with the Llama2 model", + "text": " only LoRA parameters to trainable.\n6. Run the", "type": "text" }, "event_type": { @@ -8000,7 +8205,7 @@ { "event": { "delta": { - "text": ":\n\n```python\nfrom torchtune.models.llama2 import", + "text": " LoRA finetuning recipe in Torchtune with the desired", "type": "text" }, "event_type": { @@ -8015,7 +8220,7 @@ { "event": { "delta": { - "text": " llama2_7b, lora_llama2_7", + "text": " configuration.\n\nYou can also experiment with different LoRA configurations, such", "type": "text" }, "event_type": { @@ -8030,7 +8235,7 @@ { "event": { "delta": { - "text": "b\n\n# Build Llama2 without any LoRA layers\n", + "text": " as applying LoRA to all linear layers in the self-attention", "type": "text" }, "event_type": { @@ -8045,7 +8250,7 @@ { "event": { "delta": { - "text": "base_model = llama2_7b()\n\n# The default settings for", + "text": ", increasing the rank, or scaling alpha and rank together.\n\nBy", "type": "text" }, "event_type": { @@ -8060,7 +8265,7 @@ { "event": { "delta": { - "text": " lora_llama2_7b will match", + "text": " following these steps, you can use LoRA in Torchtune", "type": "text" }, "event_type": { @@ -8075,7 +8280,7 @@ { "event": { "delta": { - "text": " those for llama2_7b\n# We just need to define", + "text": " to fine-tune a Llama2", "type": "text" }, "event_type": { @@ -8090,7 +8295,7 @@ { "event": { "delta": { - "text": " which layers we want LoRA applied to.\n# Within each self-", + "text": " model with parameter-efficient finetuning and memory savings.", "type": "text" }, "event_type": { @@ -8105,27 +8310,35 @@ { "event": { "delta": { - "text": "attention, we can choose from [\"q_proj\", \"", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:ab1b9\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8bcf6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": "k_proj\", \"v_proj\", and \"output_proj\"].\n# We", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -8135,7 +8348,7 @@ { "event": { "delta": { - "text": " can also set apply_lora_to_mlp=True or apply_lora", + "text": "{\"", "type": "text" }, "event_type": { @@ -8150,7 +8363,7 @@ { "event": { "delta": { - "text": "_to_output=True to apply LoRA to other linear\n# layers outside", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -8165,7 +8378,7 @@ { "event": { "delta": { - "text": " of the self-attention.\nlora_model = lora_llama", + "text": " \"knowledge_search\", \"parameters", "type": "text" }, "event_type": { @@ -8180,7 +8393,7 @@ { "event": { "delta": { - "text": "2_7b(lora_attn_modules=[\"q_proj\", \"", + "text": "\": {\"query\": \"How to use LoRA in Tor", "type": "text" }, "event_type": { @@ -8195,7 +8408,7 @@ { "event": { "delta": { - "text": "v_proj\"])\n\n# Print the first layer's self-attention in the", + "text": "chtune\"}}", "type": "text" }, "event_type": { @@ -8210,57 +8423,63 @@ { "event": { "delta": { - "text": " usual Llama2 model\nprint(base_model.layers[0].at", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "tn)\n# Print the same for Llama2 with LoRA weights", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "5af3ef1f-98c0-4c60-9b8b-892b5e921040", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, { "event": { "delta": { - "text": "\nprint(lora_model.layers[0].attn)\n```\n\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:ab1b9\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8bcf6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": "This code loads the Llama2 model and applies LoRA to the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -8270,7 +8489,7 @@ { "event": { "delta": { - "text": " query and value projection layers. You can modify the", + "text": "I", "type": "text" }, "event_type": { @@ -8285,7 +8504,7 @@ { "event": { "delta": { - "text": " `", + "text": "'m ready to help you answer questions about Torchtune based on", "type": "text" }, "event_type": { @@ -8300,7 +8519,7 @@ { "event": { "delta": { - "text": "lora_attn_modules` parameter to apply LoRA to", + "text": " the documentation you provided. What's your first question?", "type": "text" }, "event_type": { @@ -8315,27 +8534,35 @@ { "event": { "delta": { - "text": " different layers.\n\nTo train the model using the LoRA fine", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:606ad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:e37c3\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:606ad\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. 
code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0b7ba\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. 
For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": "-tuning recipe in torchtune, you can use the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -8345,7 +8572,7 @@ { "event": { "delta": { - "text": " following command:\n\n```bash\ntune run l", + "text": "To", "type": "text" }, "event_type": { @@ -8360,7 +8587,7 @@ { "event": { "delta": { - "text": "ora_finetune_single_device --config llama3/", + "text": " use LoRA, you can follow these steps", "type": "text" }, "event_type": { @@ -8375,7 +8602,7 @@ { "event": { "delta": { - "text": "8B_lora_single_device\n```\n\nThis will train the", + "text": ":\n\n1. Install the necessary packages, including torchtune", "type": "text" }, "event_type": { @@ -8390,7 +8617,7 @@ { "event": { "delta": { - "text": " model for one epoch on a common instruct", + "text": " and the Llama2 model.\n2.", "type": "text" }, "event_type": { @@ -8405,7 +8632,7 @@ { "event": { "delta": { - "text": " dataset. You can modify the command to change the training settings", + "text": " Load the Llama2 model and specify which layers to", "type": "text" }, "event_type": { @@ -8420,7 +8647,7 @@ { "event": { "delta": { - "text": ", such as the number of epochs or the batch size.\n\nAfter", + "text": " apply LoRA to.\n3. Define the LoRA parameters", "type": "text" }, "event_type": { @@ -8435,7 +8662,7 @@ { "event": { "delta": { - "text": " training, you can use the trained model for inference or further fine", + "text": ", such as the rank and alpha values.\n", "type": "text" }, "event_type": { @@ -8450,7 +8677,7 @@ { "event": { "delta": { - "text": "-tuning. You can load the model using the `", + "text": "4. Train the model using the LoRA fine-tuning", "type": "text" }, "event_type": { @@ -8465,7 +8692,7 @@ { "event": { "delta": { - "text": "load_checkpoint` method and use it to make predictions or continue", + "text": " recipe in torchtune.\n5. 
Use the", "type": "text" }, "event_type": { @@ -8480,7 +8707,7 @@ { "event": { "delta": { - "text": " training.\n\n```", + "text": " trained model for inference or further fine-tuning.\n\nHere is an", "type": "text" }, "event_type": { @@ -8495,35 +8722,12 @@ { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "out_of_tokens" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", + "text": " example of how to use LoRA with the Llama2 model", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -8533,7 +8737,7 @@ { "event": { "delta": { - "text": "{\"", + "text": ":\n\n```python\nfrom torchtune.models.llama2 import", "type": "text" }, "event_type": { @@ -8548,7 +8752,7 @@ { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": " llama2_7b, lora_llama2_7", "type": "text" }, "event_type": { @@ -8563,7 +8767,7 @@ { "event": { "delta": { - "text": "\": {\"query\": \"How to use LoRA\"}}", + "text": "b\n\n# Build Llama2 without any LoRA layers\n", "type": "text" }, "event_type": { @@ -8578,63 +8782,42 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA" - }, - "call_id": "8b617e66-08b4-4e93-8219-29b8b84c4672", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "base_model = llama2_7b()\n\n# The default settings for", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null }, { "event": { "delta": { - "text": "", + "text": " lora_llama2_7b will match", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": " those for llama2_7b\n# We just need to define", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -8644,7 +8827,7 @@ { "event": { "delta": { - "text": "You", + "text": " which layers we want LoRA applied to.\n# Within each self-", "type": "text" }, "event_type": { @@ -8659,7 +8842,7 @@ { "event": { "delta": { - "text": " can use the following function call to answer the user's question:\n\n", + "text": "attention, we can choose from [\"q_proj\", \"", "type": "text" }, "event_type": { @@ -8674,7 +8857,7 @@ { "event": { "delta": { - "text": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "text": "k_proj\", \"v_proj\", and \"output_proj\"].\n# We", "type": "text" }, "event_type": { @@ -8689,7 +8872,7 @@ { "event": { "delta": { - "text": " \"parameters\": {\"query\": \"How to fine-tune a", + "text": " can also set apply_lora_to_mlp=True or apply_lora", "type": "text" }, "event_type": { @@ -8704,7 +8887,7 @@ { "event": { "delta": { - "text": " Llama2 model with LoRA in torchtune\"}}", + "text": "_to_output=True to apply LoRA to other linear\n# layers outside", "type": "text" }, "event_type": { @@ -8719,35 +8902,27 @@ { "event": { "delta": { - "text": "", + "text": " of the self-attention.\nlora_model = lora_llama", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:cbc88\\nContent: .. 
_lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:8892b\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:cbc88\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:9dcb7\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": "2_7b(lora_attn_modules=[\"q_proj\", \"", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -8757,7 +8932,7 @@ { "event": { "delta": { - "text": "To", + "text": "v_proj\"])\n\n# Print the first layer's self-attention in the", "type": "text" }, "event_type": { @@ -8772,7 +8947,7 @@ { "event": { "delta": { - "text": " use LoRA, you can follow these steps:\n\n1.", + "text": " usual Llama2 model\nprint(base_model.layers[0].at", "type": "text" }, "event_type": { @@ -8787,7 +8962,7 @@ { "event": { "delta": { - "text": " Install the necessary packages, including torchtune and the L", + "text": "tn)\n# Print the same for Llama2 with LoRA weights", "type": "text" }, "event_type": { @@ -8802,7 +8977,7 @@ { "event": { "delta": { - "text": "lama2 model.\n2. Load the Llama2 model", + "text": "\nprint(lora_model.layers[0].attn)\n```\n\n", "type": "text" }, "event_type": { @@ -8817,7 +8992,7 @@ { "event": { "delta": { - "text": " and specify which layers to apply LoRA to.\n3. ", + "text": "This code loads the Llama2 model and applies LoRA to the", "type": "text" }, "event_type": { @@ -8832,7 +9007,7 @@ { "event": { "delta": { - "text": " Define the LoRA parameters, such as the rank and alpha values", + "text": " query and value projection layers. You can modify the", "type": "text" }, "event_type": { @@ -8847,7 +9022,7 @@ { "event": { "delta": { - "text": ".\n4. Train the model using the LoRA fine-t", + "text": " `", "type": "text" }, "event_type": { @@ -8862,7 +9037,7 @@ { "event": { "delta": { - "text": "uning recipe in torchtune.\n\nHere is", + "text": "lora_attn_modules` parameter to apply LoRA to", "type": "text" }, "event_type": { @@ -8877,7 +9052,7 @@ { "event": { "delta": { - "text": " an example of how to use", + "text": " different layers.\n\nTo train the model using the LoRA fine", "type": "text" }, "event_type": { @@ -8892,7 +9067,7 @@ { "event": { "delta": { - "text": " LoRA with the Llama2 model", + "text": "-tuning recipe in torchtune, you can use the", "type": "text" }, "event_type": { @@ -8907,7 +9082,7 @@ { "event": { "delta": { - "text": ":\n\n```python\nfrom torchtune.models.llama2 import", + "text": " following command:\n\n```bash\ntune run l", "type": "text" }, "event_type": { @@ -8922,7 +9097,7 @@ { "event": { "delta": { - "text": " llama2_7b, lora_llama", + "text": "ora_finetune_single_device --config llama3/", "type": "text" }, "event_type": { @@ -8937,7 +9112,7 @@ { "event": { "delta": { - "text": "2_7b\n\n# Build Llama", + "text": "8B_lora_single_device\n```\n\nThis will train the", "type": "text" }, "event_type": { @@ -8952,7 +9127,7 @@ { "event": { "delta": { - "text": "2 without any LoRA layers\nbase_model = llama2_", + "text": " model for one epoch on a common instruct", "type": "text" }, "event_type": { @@ -8967,7 +9142,7 @@ { "event": { "delta": { - "text": "7b()\n\n# The default settings for lora_llama2", + "text": " dataset. 
You can modify the command to change the training settings", "type": "text" }, "event_type": { @@ -8982,7 +9157,7 @@ { "event": { "delta": { - "text": "_7b will match those for llama2_7", + "text": ", such as the number of epochs or the batch size.\n\nAfter", "type": "text" }, "event_type": { @@ -8997,7 +9172,7 @@ { "event": { "delta": { - "text": "b\n# We just need to define which layers we want", + "text": " training, you can use the trained model for inference or further fine", "type": "text" }, "event_type": { @@ -9012,7 +9187,7 @@ { "event": { "delta": { - "text": " LoRA applied to.\n# Within each self", + "text": "-tuning. You can load the model using the `", "type": "text" }, "event_type": { @@ -9027,7 +9202,7 @@ { "event": { "delta": { - "text": "-attention, we can choose from [\"q_proj\", \"", + "text": "load_checkpoint` method and use it to make predictions or continue", "type": "text" }, "event_type": { @@ -9042,7 +9217,7 @@ { "event": { "delta": { - "text": "k_proj\", \"v_proj\", and \"output_proj", + "text": " training.\n\n```", "type": "text" }, "event_type": { @@ -9057,27 +9232,35 @@ { "event": { "delta": { - "text": "\"].\n# We can also set apply_lora_to_m", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "out_of_tokens" + } }, "metrics": null - }, + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. 
note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ { "event": { "delta": { - "text": "lp=True or apply_lora_to_output=True", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -9087,7 +9270,7 @@ { "event": { "delta": { - "text": " to apply LoRA to other", + "text": "{\"", "type": "text" }, "event_type": { @@ -9102,7 +9285,7 @@ { "event": { "delta": { - "text": " linear\n# layers outside of the self-attention.\nl", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "text" }, "event_type": { @@ -9117,7 +9300,7 @@ { "event": { "delta": { - "text": "ora_model = lora_llama", + "text": "\": {\"query\": \"How to use LoRA\"}}", "type": "text" }, "event_type": { @@ -9132,14 +9315,65 @@ { "event": { "delta": { - "text": "2_7b(lora_at", - "type": "text" - }, + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA" + }, + "call_id": "8b617e66-08b4-4e93-8219-29b8b84c4672", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, "stop_reason": null }, "metrics": null @@ -9147,7 +9381,7 @@ { "event": { "delta": { - "text": "tn_modules=[\"q_proj\", \"", + "text": "You", "type": "text" }, "event_type": { @@ -9162,7 +9396,7 @@ { "event": { "delta": { - "text": "v_proj\"])\n\n# Print the first", + "text": " can use the following function call to answer the user's question:\n\n", "type": "text" }, "event_type": { @@ -9177,7 +9411,7 @@ { "event": { "delta": { - "text": " layer's self-attention in the", + "text": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "text" }, "event_type": { @@ -9192,7 +9426,7 @@ { "event": { "delta": { - "text": " usual Llama2 model\nprint", + "text": " \"parameters\": {\"query\": \"How to fine-tune a", "type": "text" }, "event_type": { @@ -9207,7 +9441,7 @@ { "event": { "delta": { - "text": "(base_model.layers[0].attn)\n# Print the", + "text": " Llama2 model with LoRA in torchtune\"}}", "type": "text" }, "event_type": { @@ -9222,7 +9456,45 @@ { "event": { "delta": { - "text": " same for Llama2 with LoRA", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f3963\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e075f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:0484f\\nContent: .. 
_lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:0484f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:0484f\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "To", "type": "text" }, "event_type": { @@ -9237,7 +9509,7 @@ { "event": { "delta": { - "text": " weights\nprint(lora_model.layers[0].attn", + "text": " use LoRA in Torchtune, you can follow these steps", "type": "text" }, "event_type": { @@ -9252,7 +9524,7 @@ { "event": { "delta": { - "text": ")\n```\n\nThis code will load the Llama2 model", + "text": ":\n\n1. Install Torchtune and its dependencies.\n", "type": "text" }, "event_type": { @@ -9267,7 +9539,7 @@ { "event": { "delta": { - "text": " and apply LoRA to the specified layers. The `l", + "text": "2. Download the Llama2 weights and tokenizer.\n3", "type": "text" }, "event_type": { @@ -9282,7 +9554,7 @@ { "event": { "delta": { - "text": "ora_attn_modules` parameter is used to specify which layers", + "text": ". Use", "type": "text" }, "event_type": { @@ -9297,7 +9569,7 @@ { "event": { "delta": { - "text": " to apply LoRA to, and the `apply_lora", + "text": " the `lora_llama2_7", "type": "text" }, "event_type": { @@ -9312,7 +9584,7 @@ { "event": { "delta": { - "text": "_to_mlp` and `apply_lora_to_output`", + "text": "b` model in Torchtune, which applies", "type": "text" }, "event_type": { @@ -9327,7 +9599,7 @@ { "event": { "delta": { - "text": " parameters can be used to apply LoRA", + "text": " LoRA to the Q and V projections by default", "type": "text" }, "event_type": { @@ -9342,7 +9614,7 @@ { "event": { "delta": { - "text": " to other linear layers outside of the self", + "text": ".\n4. 
Load the base model weights into the LoRA", "type": "text" }, "event_type": { @@ -9357,7 +9629,7 @@ { "event": { "delta": { - "text": "-attention.\n\nYou can also use the", + "text": " model without any conversion necessary.\n5.", "type": "text" }, "event_type": { @@ -9372,7 +9644,7 @@ { "event": { "delta": { - "text": " `tune run` command to fine-tune the model", + "text": " Set only LoRA parameters", "type": "text" }, "event_type": { @@ -9387,7 +9659,7 @@ { "event": { "delta": { - "text": " using the LoRA fine-tuning recipe in torchtune.", + "text": " to trainable.\n6.", "type": "text" }, "event_type": { @@ -9402,7 +9674,7 @@ { "event": { "delta": { - "text": " For example:\n\n```bash\ntune", + "text": " Run the LoRA fin", "type": "text" }, "event_type": { @@ -9417,7 +9689,7 @@ { "event": { "delta": { - "text": " run lora_finetune_single_device --config llama3", + "text": "etuning recipe in Torcht", "type": "text" }, "event_type": { @@ -9432,7 +9704,7 @@ { "event": { "delta": { - "text": "/8B_lora_single_device\n``", + "text": "une with the desired configuration.\n\nYou can also experiment", "type": "text" }, "event_type": { @@ -9447,7 +9719,7 @@ { "event": { "delta": { - "text": "`\n\nThis will run the LoRA fine-tuning recipe on the", + "text": " with different Lo", "type": "text" }, "event_type": { @@ -9462,7 +9734,7 @@ { "event": { "delta": { - "text": " Llama3-8B-Instruct model using the default configuration", + "text": "RA configurations, such as applying LoRA to", "type": "text" }, "event_type": { @@ -9477,7 +9749,7 @@ { "event": { "delta": { - "text": ". You can modify the configuration by adding command-line overrides, such", + "text": " all linear layers in the self-attention, increasing the rank,", "type": "text" }, "event_type": { @@ -9492,7 +9764,7 @@ { "event": { "delta": { - "text": " as:\n\n```bash\ntune run", + "text": " or scaling alpha and rank together.", "type": "text" }, "event_type": { @@ -9517,7 +9789,7 @@ "logprobs": null, "stop_reason": { "__enum__": "StopReason", - "value": "out_of_tokens" + "value": "end_of_turn" } }, "metrics": null @@ -9525,7 +9797,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f3963\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e075f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -9560,7 +9832,7 @@ { "event": { "delta": { - "text": "type\": \"function\", \"", + "text": "type\": \"function\", \"name", "type": "text" }, "event_type": { @@ -9575,7 +9847,7 @@ { "event": { "delta": { - "text": "name\": \"knowledge_search\", \"parameters\":", + "text": "\": \"knowledge_search\", \"parameters\":", "type": "text" }, "event_type": { @@ -9590,7 +9862,22 @@ { "event": { "delta": { - "text": " {\"query\": \"How to use LoRA\"}}", + "text": " {\"query\": \"How to use Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "RA in Torchtune\"}}", "type": "text" }, "event_type": { @@ -9611,9 +9898,9 @@ }, "tool_call": { "arguments": { - "query": "How to use LoRA" + "query": "How to use LoRA in Torchtune" }, - "call_id": "64448cc3-c11a-4bae-bdcc-e5b8d13b888f", + "call_id": "42e1de09-f47e-44b0-9331-9b878556970d", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -9651,7 +9938,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f3963\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e075f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -9671,37 +9958,7 @@ { "event": { "delta": { - "text": "You", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can use the following function call to answer", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the user's question:\n\n{\"type\": \"function\", \"", + "text": "I", "type": "text" }, "event_type": { @@ -9716,7 +9973,7 @@ { "event": { "delta": { - "text": "name\": \"knowledge_search\", \"parameters\":", + "text": "'m ready to help you answer questions about", "type": "text" }, "event_type": { @@ -9731,7 +9988,7 @@ { "event": { "delta": { - "text": " {\"query\": \"How to fine-tune a Llama2", + "text": " Torchtune based on the documentation you", "type": "text" }, "event_type": { @@ -9746,7 +10003,7 @@ { "event": { "delta": { - "text": " model with LoRA in torchtune\"}}", + "text": " provided. What's your first question?", "type": "text" }, "event_type": { @@ -9779,7 +10036,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:cbc88\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:8892b\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:cbc88\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:9dcb7\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. 
note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -9799,12 +10056,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "To", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -9818,12 +10071,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", - "type": "tool_call" + "text": " use LoRA, you can follow these steps:\n\n1.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -9837,12 +10086,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\", \"parameters\": {\"query\": \"", - "type": "tool_call" + "text": " Install the necessary packages, including torchtune and the L", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -9856,12 +10101,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Torchtune documentation\"}}", - "type": "tool_call" + "text": "lama2 model.\n2. Load the Llama2 model", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -9875,63 +10116,42 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "6ec2bf0f-42f3-453d-ad5f-52bc6e0267b7", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " and specify which layers to apply LoRA to.\n3. 
", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null }, { "event": { "delta": { - "text": "", + "text": " Define the LoRA parameters, such as the rank and alpha values", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. 
Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <<SYS>>\\n You are a helpful, respectful, and honest assistant.\\n <</SYS>>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ + }, { "event": { "delta": { - "text": "", + "text": ".\n4. Train the model using the LoRA fine-t", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -9941,7 +10161,7 @@ { "event": { "delta": { - "text": "L", + "text": "uning recipe in torchtune.\n\nHere is", "type": "text" }, "event_type": { @@ -9956,7 +10176,7 @@ { "event": { "delta": { - "text": "lama3-8B uses grouped-query attention instead of the standard multi-head", + "text": " an example of how to use", "type": "text" }, "event_type": { @@ -9971,7 +10191,7 @@ { "event": { "delta": { - "text": " attention from Llama2-7B.", + "text": " LoRA with the Llama2 model", "type": "text" }, "event_type": { @@ -9986,12 +10206,1008 @@ { "event": { "delta": { - "text": "", + "text": ":\n\n```python\nfrom torchtune.models.llama2 import", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " llama2_7b, lora_llama", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "2_7b\n\n# Build Llama", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "2 without any LoRA layers\nbase_model = llama2_", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "7b()\n\n# The default settings for lora_llama2", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_7b will match those for llama2_7", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "b\n# We just need to define which layers we want", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " LoRA applied to.\n# Within each self", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "-attention, we can choose from [\"q_proj\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "k_proj\", \"v_proj\", and \"output_proj", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + 
"logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "\"].\n# We can also set apply_lora_to_m", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "lp=True or apply_lora_to_output=True", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " to apply LoRA to other", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " linear\n# layers outside of the self-attention.\nl", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "ora_model = lora_llama", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "2_7b(lora_at", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "tn_modules=[\"q_proj\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "v_proj\"])\n\n# Print the first", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " layer's self-attention in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " usual Llama2 model\nprint", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "(base_model.layers[0].attn)\n# Print the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " same for Llama2 with LoRA", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " weights\nprint(lora_model.layers[0].attn", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ")\n```\n\nThis code will load the Llama2 model", + 
"type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " and apply LoRA to the specified layers. The `l", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "ora_attn_modules` parameter is used to specify which layers", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " to apply LoRA to, and the `apply_lora", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "_to_mlp` and `apply_lora_to_output`", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " parameters can be used to apply LoRA", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " to other linear layers outside of the self", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "-attention.\n\nYou can also use the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " `tune run` command to fine-tune the model", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " using the LoRA fine-tuning recipe in torchtune.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " For example:\n\n```bash\ntune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " run lora_finetune_single_device --config llama3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "/8B_lora_single_device\n``", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "`\n\nThis will run the LoRA fine-tuning recipe on the", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " Llama3-8B-Instruct model using the default configuration", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": ". You can modify the configuration by adding command-line overrides, such", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " as:\n\n```bash\ntune run", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "out_of_tokens" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "type\": \"function\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "name\": \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " {\"query\": \"How to use LoRA\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA" + }, + "call_id": "64448cc3-c11a-4bae-bdcc-e5b8d13b888f", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "You", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " can use the following function call to answer", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " the user's question:\n\n{\"type\": \"function\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "name\": \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " {\"query\": \"How to fine-tune a Llama2", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " model with LoRA in torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\", \"parameters\": {\"query\": \"Torchtune", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " documentation\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "0f0eb27a-1126-4d26-8b33-b630a9518093", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" }, "logprobs": null, "stop_reason": { @@ -10004,7 +11220,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -10039,7 +11255,22 @@ { "event": { "delta": { - "text": " attention type used by Llama3-8B is grouped-query attention.", + "text": " attention type used by Llama3-8B is grouped", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "-query attention.", "type": "text" }, "event_type": { @@ -10072,7 +11303,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. 
Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -10092,7 +11323,22 @@ { "event": { "delta": { - "text": "{\n", + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " attention type used by Llama3-", "type": "text" }, "event_type": { @@ -10107,7 +11353,7 @@ { "event": { "delta": { - "text": " \"type\": \"function\",\n ", + "text": "8B is grouped-query attention.", "type": "text" }, "event_type": { @@ -10122,7 +11368,45 @@ { "event": { "delta": { - "text": " \"name\": \"knowledge_search\",\n \"parameters\": {\n \"", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "{\n", "type": "text" }, "event_type": { @@ -10137,7 +11421,7 @@ { "event": { "delta": { - "text": "query\": \"Llama3", + "text": " \"type\": \"function\",\n \"name\": \"knowledge", "type": "text" }, "event_type": { @@ -10152,7 +11436,7 @@ { "event": { "delta": { - "text": "-8B attention type\"\n }\n", + "text": "_search\",\n \"parameters\": {\n \"query\": \"L", "type": "text" }, "event_type": { @@ -10167,7 +11451,7 @@ { "event": { "delta": { - "text": "}", + "text": "lama3-8B attention type\"\n }\n}", "type": "text" }, "event_type": { @@ -10190,7 +11474,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "95471ab3-196c-45ba-a7f1-7585026662c2", + "call_id": "ce62cb6d-fcb0-437a-abd9-b0bed88628ed", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -10271,7 +11555,26 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"query\": \"L", "type": "tool_call" }, "event_type": { @@ -10290,7 +11593,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "parameters\": {\"query\": \"Llama3-8B attention type\"}}", + "tool_call": "lama3-8B attention type\"}}", "type": "tool_call" }, "event_type": { @@ -10313,7 +11616,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "f026154f-72fb-47aa-828c-065bd5a16267", + "call_id": "25fcc4f2-72a8-4175-82ca-c7a692d13d66", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -10613,7 +11916,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "brave_search.call(query=\"current CEO of", + "tool_call": "brave_search.call(query=\"current", "type": "tool_call" }, "event_type": { @@ -10632,7 +11935,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " Meta\")", + "tool_call": " CEO of Meta\")", "type": "tool_call" }, "event_type": { @@ -10655,7 +11958,7 @@ "arguments": { "query": "current CEO of Meta" }, - "call_id": "b9ee4732-1663-429c-ae7d-186578174556", + "call_id": "f5d644f1-3ada-4a5a-a088-736c89428fe9", "tool_name": { "__enum__": "BuiltinTool", "value": "brave_search" @@ -10829,7 +12132,22 @@ { "event": { "delta": { - "text": " function `get_boiling_point` is not able to find", + "text": " function `get_boiling_point` is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " not able to find the boiling point of", "type": "text" }, "event_type": { @@ -10844,7 +12162,7 @@ { "event": { "delta": { - "text": " the boiling point of polyjuice as it is a fictional", + "text": " polyjuice as it is a fictional", 
"type": "text" }, "event_type": { @@ -10859,7 +12177,7 @@ { "event": { "delta": { - "text": " liquid from the Harry Potter series. The", + "text": " liquid from the Harry Potter series. The function is only able", "type": "text" }, "event_type": { @@ -10874,7 +12192,7 @@ { "event": { "delta": { - "text": " function only works with real-world liquids.", + "text": " to find the boiling point of real liquids.", "type": "text" }, "event_type": { @@ -11070,7 +12388,7 @@ { "event": { "delta": { - "text": " function `get_boiling_point` is not", + "text": " function `get_boiling_point`", "type": "text" }, "event_type": { @@ -11085,7 +12403,7 @@ { "event": { "delta": { - "text": " able to find the boiling point of polyjuice as it is", + "text": " is not able to find the boiling point of", "type": "text" }, "event_type": { @@ -11100,7 +12418,7 @@ { "event": { "delta": { - "text": " not a real liquid. Polyjuice is a magical potion from", + "text": " polyjuice as it is not a", "type": "text" }, "event_type": { @@ -11115,7 +12433,7 @@ { "event": { "delta": { - "text": " the Harry Potter series.", + "text": " real liquid.", "type": "text" }, "event_type": { @@ -11296,7 +12614,7 @@ { "event": { "delta": { - "text": " function `get_boiling_point` is", + "text": " function `get_boiling_point` is not able", "type": "text" }, "event_type": { @@ -11311,7 +12629,7 @@ { "event": { "delta": { - "text": " not able to find the boiling point of polyjuice as it", + "text": " to find the boiling point of polyju", "type": "text" }, "event_type": { @@ -11326,7 +12644,7 @@ { "event": { "delta": { - "text": " is not a real liquid. Polyjuice is", + "text": "ice as it is not a real", "type": "text" }, "event_type": { @@ -11341,7 +12659,7 @@ { "event": { "delta": { - "text": " a magical potion from the Harry Potter series.", + "text": " liquid.", "type": "text" }, "event_type": { @@ -11559,7 +12877,26 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\", \"name\": \"get_boiling_point\",", "type": "tool_call" }, "event_type": { @@ -11578,7 +12915,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " \"get_boiling_point\", \"parameters\":", + "tool_call": " \"parameters\": {\"liquid_name\": \"", "type": "tool_call" }, "event_type": { @@ -11597,7 +12934,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " {\"liquid_name\": \"polyjuice\"}}", + "tool_call": "polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -11620,7 +12957,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "a994859b-38d2-45d5-913e-359409ee8ae2", + "call_id": "22050f4b-36df-48fb-ac11-e3a47fa0beaf", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -11843,7 +13180,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "tool_call": "{\"type\": \"function\", \"name", "type": "tool_call" }, "event_type": { @@ -11862,7 +13199,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "tool_call": 
"\": \"get_boiling_point\", \"parameters", "type": "tool_call" }, "event_type": { @@ -11881,7 +13218,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "\"}}", + "tool_call": "\": {\"liquid_name\": \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -11904,7 +13241,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "e48d4312-1a88-4759-9b9c-bc573c23fee6", + "call_id": "11302682-7a3a-45f3-955b-6709444fd626", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12120,7 +13457,7 @@ { "event": { "delta": { - "text": " couldn't find any information on the boiling point of Poly", + "text": " couldn't find any information on the boiling point", "type": "text" }, "event_type": { @@ -12135,7 +13472,7 @@ { "event": { "delta": { - "text": "juice. Polyjuice is a magical potion in", + "text": " of Polyjuice. Polyjuice is a magical potion in the", "type": "text" }, "event_type": { @@ -12150,7 +13487,7 @@ { "event": { "delta": { - "text": " the Harry Potter series that allows the drinker", + "text": " Harry Potter series that allows the drinker to transform into someone else. It's", "type": "text" }, "event_type": { @@ -12165,7 +13502,7 @@ { "event": { "delta": { - "text": " to transform into someone else. It's not a physical substance", + "text": " not a physical substance with a boiling point. If", "type": "text" }, "event_type": { @@ -12180,7 +13517,7 @@ { "event": { "delta": { - "text": " with a boiling point. If you have any other questions, I'd", + "text": " you have any other questions, I'd be", "type": "text" }, "event_type": { @@ -12195,7 +13532,7 @@ { "event": { "delta": { - "text": " be happy to help.", + "text": " happy to help.", "type": "text" }, "event_type": { @@ -12413,7 +13750,26 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name", "type": "tool_call" }, "event_type": { @@ -12432,7 +13788,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "tool_call": "\": \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -12455,7 +13811,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "cd0e926b-b1c8-468b-8c55-b3e42e7ae89d", + "call_id": "e704d0f9-45a1-4ed1-90b0-8a05c504da6c", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12503,22 +13859,7 @@ }, "event_type": { "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -12528,7 +13869,7 @@ { "event": { "delta": { - "text": " 100th prime number is ", + "text": "The", "type": "text" }, "event_type": { @@ -12543,7 +13884,7 @@ { "event": { "delta": { - "text": "541.", + "text": " 100th prime number is 541.", "type": "text" }, "event_type": { @@ -12619,7 +13960,7 @@ 
"__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n ", + "tool_call": "def is_prime(n):\n if n", "type": "tool_call" }, "event_type": { @@ -12638,7 +13979,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " return False\n if n <= 3:\n return True", + "tool_call": " <= 1:\n return False\n if n <= 3:\n return", "type": "tool_call" }, "event_type": { @@ -12657,7 +13998,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "\n if n % 2 ==", + "tool_call": " True\n if n % 2 == 0 or n", "type": "tool_call" }, "event_type": { @@ -12676,7 +14017,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 0 or n % 3 == 0:\n ", + "tool_call": " % 3 == 0:\n ", "type": "tool_call" }, "event_type": { @@ -12695,7 +14036,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " return False\n i = 5\n", + "tool_call": " return False\n i = 5\n while i *", "type": "tool_call" }, "event_type": { @@ -12714,7 +14055,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " while i * i <= n:\n if n % i", + "tool_call": " i <= n:\n if n % i", "type": "tool_call" }, "event_type": { @@ -12733,7 +14074,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " == 0 or n % (i + 2) ==", + "tool_call": " == 0 or n % (i + 2", "type": "tool_call" }, "event_type": { @@ -12752,7 +14093,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 0:\n return False\n i += 6\n", + "tool_call": ") == 0:\n return False", "type": "tool_call" }, "event_type": { @@ -12771,7 +14112,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " return True\n\ndef get_nth_prime(n):\n count =", + "tool_call": "\n i += 6\n ", "type": "tool_call" }, "event_type": { @@ -12790,7 +14131,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 0\n num = 2\n while True:\n", + "tool_call": " return True\n\ndef get_nth_prime(n):\n count = ", "type": "tool_call" }, "event_type": { @@ -12809,7 +14150,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " if is_prime(num):\n count += 1\n ", + "tool_call": "0\n num = 2\n ", "type": "tool_call" }, "event_type": { @@ -12828,7 +14169,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " if count == n:\n return num\n num +=", + "tool_call": " while True:\n if is_prime(num):\n count += ", "type": "tool_call" }, "event_type": { @@ -12847,7 +14188,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " 1\n\nprint(get_nth_prime(", + "tool_call": "1\n if count == n:\n return num\n ", "type": "tool_call" }, "event_type": { @@ -12866,7 +14207,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "100))", + "tool_call": " num += 1\n\nprint(get_nth_prime(100))", "type": "tool_call" }, "event_type": { @@ -12889,7 +14230,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "a184cbe8-b941-472d-9254-fda5ed8d770f", + "call_id": 
"6d57c323-7679-447f-9928-ccab76c0bdc9", "tool_name": { "__enum__": "BuiltinTool", "value": "code_interpreter" @@ -12965,7 +14306,22 @@ { "event": { "delta": { - "text": "plexity the company was founded in 2022.", + "text": "plexity the company was founded in 202", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "2.", "type": "text" }, "event_type": { @@ -13101,8 +14457,253 @@ { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"", - "type": "text" + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "22d5440e-2873-4956-a81f-f114fc78671d", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America 
(BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " \"parameters\": {\"query\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " \"Perplexity company founding date\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "e4a5ff1d-ac00-4e0a-b93b-17e19fa3bc55", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } + }, + "metrics": null + } + ], + "type": "generator" + }, + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, 
system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "chunks": [ + { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13116,8 +14717,12 @@ { "event": { "delta": { - "text": "knowledge_search\", \"parameters\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\": \"knowledge_search\", \"parameters\": {\"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13131,8 +14736,12 @@ { "event": { "delta": { - "text": " {\"query\": \"Perplexity company founding date\"}}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "query\": \"Perplexity company founding date\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13154,7 +14763,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "9ad1f31d-4fb3-40e6-8037-0cc50794d6ce", + "call_id": "98d3790b-1b84-4ab7-ad66-117fea68d5db", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -13192,7 +14801,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), 
TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -13212,8 +14821,12 @@ { "event": { "delta": { - "text": "{\"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13227,8 +14840,12 @@ { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\",", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13242,8 +14859,12 @@ { "event": { "delta": { - "text": " \"parameters\": {\"query\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"query\": \"Perplexity company founding", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13257,8 +14878,12 @@ { "event": { "delta": { - "text": " \"Perplexity company founding date\"}}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": " date\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13280,7 +14905,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "e4a5ff1d-ac00-4e0a-b93b-17e19fa3bc55", + "call_id": "6add8292-f388-4ec5-8ec5-5071c9397492", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -13318,7 +14943,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -13338,12 +14963,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13357,12 +14978,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", - "type": "tool_call" + "text": " NBA was created on August ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13376,12 +14993,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", - "type": "tool_call" + "text": "3, 1949, with", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13395,28 +15008,45 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity company founding date" - }, - "call_id": "11c1dca5-6754-4ba6-8337-1bb8a538342f", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " the merger of the Basketball Association of", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " America (BAA) and the National Basketball League", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " (NBL).", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null }, @@ -13441,7 +15071,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -13461,12 +15091,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13480,12 +15106,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", - "type": "tool_call" + "text": " NBA was created on August 3, ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13499,12 +15121,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " \"parameters\": {\"query\": \"Perplexity company founding", - "type": "tool_call" + "text": "1949, with the merger of the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13518,12 +15136,8 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " date\"}}", - "type": "tool_call" + "text": " Basketball Association of America (BAA) and the National Basketball", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13537,28 +15151,15 @@ { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity company founding date" - }, - "call_id": "6add8292-f388-4ec5-8ec5-5071c9397492", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " League (NBL).", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null }, @@ -13583,7 +15184,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, 
worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'when was the nba created'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'when was the nba created'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski 
was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -13618,7 +15219,22 @@ { "event": { "delta": { - "text": " NBA was created on August 3, ", + "text": " NBA was created on August 3, 1949,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "text": " with the merger of the Basketball Association of America", "type": "text" }, "event_type": { @@ -13633,7 +15249,7 @@ { "event": { "delta": { - "text": "1949, with the merger of the Basketball Association of America", + "text": " (BAA) and the National Basketball", "type": "text" }, "event_type": { @@ -13648,7 +15264,7 @@ { "event": { "delta": { - "text": " (BAA) and the National Basketball League (NBL).", + "text": " League (NBL).", "type": "text" }, "event_type": { @@ -13681,7 +15297,7 @@ ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), 
TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'when was the nba created'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { "chunks": [ { "event": { @@ -13701,7 +15317,7 @@ { "event": { "delta": { - "text": "The", + "text": "{\"", "type": "text" }, "event_type": { @@ -13716,7 +15332,7 @@ { "event": { "delta": { - "text": " NBA was created on August 3, ", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -13731,7 +15347,7 @@ { "event": { "delta": { - "text": "1949, with the merger of the", + "text": " \"knowledge_search\", \"parameters\": {\"query", "type": "text" }, "event_type": { @@ -13746,7 +15362,7 @@ { "event": { "delta": { - "text": " Basketball Association of America (BAA) and the National Basketball", + "text": "\": \"when was the nba created\"}}", "type": "text" }, "event_type": { @@ -13761,15 +15377,28 @@ { "event": { "delta": { - "text": " League (NBL).", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "when was the nba created" + }, + "call_id": "c132966d-e4be-47de-9512-7e9e2e6d896c", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "value": "end_of_turn" + } }, "metrics": null }, @@ -13837,7 +15466,45 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\": \"knowledge_search\", \"parameters", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + }, + { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "value": "in_progress" + }, + "tool_call": "\": {\"query\": \"when was", "type": "tool_call" }, "event_type": { @@ -13856,7 +15523,7 @@ "__enum__": "ToolCallParseStatus", "value": "in_progress" }, - "tool_call": " {\"query\": \"NBA creation date\"}}", + "tool_call": " the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -13877,9 +15544,9 @@ }, "tool_call": { "arguments": { - "query": "NBA creation date" + "query": "when was the nba created" }, - "call_id": "9ffcb7be-c9ba-478a-af1c-8f68d4033c4f", + "call_id": "0145ecf7-ff15-4e06-8684-d9c60e0e2966", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.pickle b/tests/integration/fixtures/recorded_responses/chat_completion.pickle index c4f1c7efdc..eb7534e6a7 100644 Binary files a/tests/integration/fixtures/recorded_responses/chat_completion.pickle and b/tests/integration/fixtures/recorded_responses/chat_completion.pickle differ diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json 
b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 77995f72f3..2878275daf 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -1,4 +1,13 @@ { + "()_[('kwargs', {'session_id': '', 'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}), ('tool_name', 'code_interpreter')]": { + "type": "value", + "value": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + }, "()_[('kwargs', {'session_id': '', 'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}), ('tool_name', 'code_interpreter')]": { "type": "value", "value": { @@ -80,6 +89,15 @@ "metadata": null } }, + "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { + "type": "value", + "value": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + }, "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": { "type": "value", "value": { @@ -98,6 +116,52 @@ "metadata": null } }, + "()_[('kwargs', {'session_id': '', 'query': 'How to use LoRA in Torchtune', 'vector_db_ids': ['vector_db_']}), 
('tool_name', 'knowledge_search')]": { + "type": "value", + "value": { + "content": [ + { + "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:cc646\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:cc646\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:cc646\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe ', 'query': 'How to use LoRA', 'vector_db_ids': ['vector_db_']}), ('tool_name', 'knowledge_search')]": { "type": "value", "value": { @@ -307,23 +371,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:f76dc\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:8bcf6\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:c4fc3\nContent: 06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:cc646\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:de2d4\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:8bcf6\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. 
_glossary_fsdp2:\n\n", "type": "text" }, { @@ -335,11 +399,11 @@ "error_message": null, "metadata": { "document_ids": [ - "f76dc7f5-9648-4272-a579-c8387fb1408a", - "c4fc3cb6-6172-489e-90a7-b39d343e14c0", - "de2d49de-55de-44dd-9bca-6f4f6d633b0a", - "c4fc3cb6-6172-489e-90a7-b39d343e14c0", - "de2d49de-55de-44dd-9bca-6f4f6d633b0a" + "ab1b9c78-180f-48cb-bbef-c70a4a59e42d", + "cc6460bf-74ab-4d11-8d32-bc02144a4e79", + "8bcf61e4-98c4-41a7-87f9-833c1a4d2b28", + "cc6460bf-74ab-4d11-8d32-bc02144a4e79", + "8bcf61e4-98c4-41a7-87f9-833c1a4d2b28" ] } } @@ -398,5 +462,41 @@ ] } } + }, + "()_[('kwargs', {'session_id': '', 'query': 'when was the nba created', 'vector_db_ids': ['test-vector-db-']}), ('tool_name', 'knowledge_search')]": { + "type": "value", + "value": { + "content": [ + { + "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "nba_wiki", + "perplexity_wiki", + "perplexity_wiki" + ] + } + } } } diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.pickle b/tests/integration/fixtures/recorded_responses/invoke_tool.pickle index 33bccd4d34..a03204511a 100644 Binary files a/tests/integration/fixtures/recorded_responses/invoke_tool.pickle and b/tests/integration/fixtures/recorded_responses/invoke_tool.pickle differ
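For readers skimming these recordings: each fixture entry maps a normalized request key to the chunk stream the provider produced, and the tool-call chunks above follow a started -> in_progress -> succeeded parse_status lifecycle, streaming the raw JSON text of the call before the structured ToolCall arrives. Below is a minimal sketch, assuming only the raw JSON shape visible in this diff, of folding such a recorded stream back into its final call; the helper name accumulate_tool_call is hypothetical and not part of the repository.

import json
from typing import Optional


def accumulate_tool_call(chunks: list) -> Optional[dict]:
    """Fold a recorded chunk stream into its final structured tool call.

    Mirrors the chunk shape in the fixtures above: "tool_call" deltas carry
    raw JSON text fragments while parse_status is "started"/"in_progress",
    and the parsed call (arguments, call_id, tool_name) arrives with
    parse_status "succeeded".
    """
    fragments = []
    for chunk in chunks:
        delta = chunk["event"]["delta"]
        if delta.get("type") != "tool_call":
            continue
        status = delta["parse_status"]["value"]
        if status in ("started", "in_progress"):
            fragments.append(delta["tool_call"])  # raw JSON text fragment
        elif status == "succeeded":
            # Sanity check: the accumulated fragments should parse to the
            # same arguments the recorder stored in the structured call.
            parsed = json.loads("".join(fragments))
            assert parsed["parameters"] == delta["tool_call"]["arguments"]
            return delta["tool_call"]
    return None

Applied to the knowledge_search recordings in this diff, the joined fragments parse to {"type": "function", "name": "knowledge_search", "parameters": {"query": ...}}, matching the arguments carried by the succeeded chunk.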