diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
index 886a36024..921beac27 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@@ -534,7 +534,10 @@ class ChatAgent(ShieldRunnerMixin):
         session_info = await self.storage.get_session_info(session_id)
         # if the session has a memory bank id, let the memory tool use it
         if session_info and session_info.vector_db_id:
-            toolgroup_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id)
+            # The RAG toolgroup may be listed without args (or without a
+            # "vector_db_ids" key), so create both levels on demand.
+            rag_args = toolgroup_args.setdefault(RAG_TOOL_GROUP, {})
+            rag_args.setdefault("vector_db_ids", []).append(session_info.vector_db_id)
 
         output_attachments = []
 
diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py
index ca97eb692..f221582c8 100644
--- a/tests/integration/agents/test_agents.py
+++ b/tests/integration/agents/test_agents.py
@@ -401,7 +401,19 @@ def test_rag_agent(llama_stack_client_with_mocked_inference, agent_config, rag_t
             assert expected_kw in response.output_message.content.lower()
 
 
-def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config):
+@pytest.mark.parametrize(
+    "toolgroup",
+    [
+        dict(
+            name="builtin::rag/knowledge_search",
+            args={
+                "vector_db_ids": [],
+            },
+        ),
+        "builtin::rag/knowledge_search",
+    ],
+)
+def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, agent_config, toolgroup):
     urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
     documents = [
         Document(
@@ -414,14 +426,7 @@ def test_rag_agent_with_attachments(llama_stack_client_with_mocked_inference, ag
     ]
     agent_config = {
         **agent_config,
-        "toolgroups": [
-            dict(
-                name="builtin::rag/knowledge_search",
-                args={
-                    "vector_db_ids": [],
-                },
-            )
-        ],
+        "toolgroups": [toolgroup],
     }
     rag_agent = Agent(llama_stack_client_with_mocked_inference, agent_config)
     session_id = rag_agent.create_session(f"test-session-{uuid4()}")
diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json
index 8a4bae93d..021b6c936 100644
--- a/tests/integration/fixtures/recorded_responses/chat_completion.json
+++ b/tests/integration/fixtures/recorded_responses/chat_completion.json
@@ -102,22 +102,7 @@
       {
         "event": {
           "delta": {
-            "text": " boiling point of polyjuice is -100 degrees",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": " Fahrenheit.",
+            "text": " boiling point of polyjuice is -100 degrees Fahrenheit.",
             "type": "text"
           },
           "event_type": {
@@ -381,7 +366,7 @@
                 "celcius": "false",
                 "liquid_name": "polyjuice"
               },
-              "call_id": "f9d5523a-6d3a-4cfc-b02d-a1204b591a86",
+              "call_id": "b9ded2e6-bef1-40bc-8a5b-a8c1018d0ba2",
               "tool_name": "get_boiling_point"
             },
             "type": "tool_call"
@@ -624,7 +609,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "name\": \"get_boiling_point\", \"parameters\": {\"liquid_name",
+            "tool_call": "name\": \"get_boiling_point\",",
             "type": "tool_call"
           },
           "event_type": {
@@ -643,7 +628,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "\": \"polyjuice\", \"celcius\": \"true",
+            "tool_call": " \"parameters\": {\"liquid_name\": \"polyju",
             "type": "tool_call"
           },
           "event_type": {
@@ -662,7 +647,26 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "\"}}",
+            "tool_call": "ice\", \"celcius\":",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": " \"true\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -686,7 +690,7 @@
                 "celcius": "true",
                 "liquid_name": "polyjuice"
               },
-              "call_id": "874df3c4-bc63-4f21-9353-4d0e4ce9c347",
+              "call_id": "98c011b5-f5de-416e-9a06-c2e3d0fa5581",
               "tool_name": "get_boiling_point"
             },
             "type": "tool_call"
@@ -827,7 +831,22 @@
       {
         "event": {
           "delta": {
-            "text": " boiling point of polyjuice is -100\u00b0C.",
+            "text": " boiling point of polyjuice is -100\u00b0C",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ".",
             "type": "text"
           },
           "event_type": {
@@ -1046,7 +1065,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"name\":",
+            "tool_call": "{\"type\": \"function\", \"name",
             "type": "tool_call"
           },
           "event_type": {
@@ -1065,7 +1084,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " \"get_boiling_point\", \"parameters",
+            "tool_call": "\": \"get_boiling_point\", \"parameters",
             "type": "tool_call"
           },
           "event_type": {
@@ -1084,7 +1103,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"cel",
+            "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celci",
             "type": "tool_call"
           },
           "event_type": {
@@ -1103,7 +1122,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "cius\": \"true\"}}",
+            "tool_call": "us\": \"true\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -1127,7 +1146,7 @@
                 "celcius": "true",
                 "liquid_name": "polyjuice"
               },
-              "call_id": "832c5abc-4369-4a2e-b85f-e7452f634e6c",
+              "call_id": "15326d2e-d284-4c7e-86b1-5bfbba74a914",
               "tool_name": "get_boiling_point"
             },
             "type": "tool_call"
@@ -1200,22 +1219,7 @@
       {
         "event": {
           "delta": {
-            "text": " customer smiled and said \"hello\" to the friendly store",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": " clerk.",
+            "text": " customer smiled and said \"hello\" to the friendly store clerk.",
             "type": "text"
           },
           "event_type": {
@@ -1634,6 +1638,269 @@
     ],
     "type": "generator"
   },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"<TEMP_FILE>\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, 
tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "The",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " error message indicates that the `bwrap.core` module is",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " not found. This is likely because the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " `bwrap` package is not installed. To fix this,",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " you can install the `bwrap` package",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " using pip:\n\n```\npip install bwrap",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "\n```\n\nHowever, if you don't",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " have permission to install packages, you can use",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the `knowledge_search` function to get information about",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the CSV file instead:\n\n```\n{\n   ",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " \"type\": \"function\",\n    \"name\": \"",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "knowledge_search\",\n    \"parameters\": {\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "        \"query\": \"describe a csv file\"\n    }\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "}\n```\n\nThis will return a description of",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the CSV file.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
   "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"<TEMP_FILE>\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, 
tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
     "chunks": [
       {
@@ -1852,6 +2119,208 @@
     ],
     "type": "generator"
   },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"<TEMP_FILE>\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "started"
+            },
+            "tool_call": "",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "import pandas as pd\ndf = pd.read",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "_csv(\"/var/folders/cz/vyh7y1d11",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "xg881lsxsshnc5c0000gn/T/tmpc_",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "ozqkdv/GwQ6oJB4inflation",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": ".csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "())",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "succeeded"
+            },
+            "tool_call": {
+              "arguments": {
+                "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/GwQ6oJB4inflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())"
+              },
+              "call_id": "551648f3-c903-44ef-84ae-0f1dcbaaa68f",
+              "tool_name": {
+                "__enum__": "BuiltinTool",
+                "value": "code_interpreter"
+              }
+            },
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
   "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"<TEMP_FILE>\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
     "chunks": [
       {
@@ -2097,7 +2566,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var",
+            "tool_call": "import pandas as pd\ndf = pd.read",
             "type": "tool_call"
           },
           "event_type": {
@@ -2116,7 +2585,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "/folders/cz/vyh7y1d11xg",
+            "tool_call": "_csv(\"/var/folders/cz/vyh",
             "type": "tool_call"
           },
           "event_type": {
@@ -2135,7 +2604,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "881lsxsshnc5c0000gn/T/tmpkbnyor",
+            "tool_call": "7y1d11xg881lsxsshnc5c",
             "type": "tool_call"
           },
           "event_type": {
@@ -2154,7 +2623,45 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "uj/fzDfYIPeinflation.csv\")\ndf.head()",
+            "tool_call": "0000gn/T/tmpc_ozqkdv/Gw",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "Q6oJB4inflation.csv\")\n",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "print(df.head())",
             "type": "tool_call"
           },
           "event_type": {
@@ -2175,9 +2682,9 @@
             },
             "tool_call": {
               "arguments": {
-                "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpkbnyoruj/fzDfYIPeinflation.csv\")\ndf.head()"
+                "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/GwQ6oJB4inflation.csv\")\nprint(df.head())"
               },
-              "call_id": "df6b121d-9ad2-4d15-9fae-26c31f4c13c5",
+              "call_id": "204b3ad9-ff20-4fab-a055-13da99874d88",
               "tool_name": {
                 "__enum__": "BuiltinTool",
                 "value": "code_interpreter"
@@ -3698,6 +4205,1107 @@
     ],
     "type": "generator"
   },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"<TEMP_FILE>\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server, you can use the `requests` library to download the file and then load it into a pandas dataframe. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nimport requests\\n\\n# Download the csv file\\nurl = \"https://example.com/your_file.csv\"\\nresponse = requests.get(url)\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(response.content)\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace the `url` variable with the actual URL of your csv file. 
\\n\\nIf you are using a local file, you can simply use the `pd.read_csv()` function with the file path:\\n\\n```\\nimport pandas as pd\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(\\'your_file.csv\\')\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace `\\'your_file.csv\\'` with the actual path to your csv file.', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Convert \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation[\\'Year\\'], average_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation Rate\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, 
tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "It",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " seems that the file \"/var/f",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "olders/cz/vyh7y",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "1d11xg881lsx",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "sshnc5c0000gn",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "/T/tmpc_ozqkdv/EzGU",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "QEnJinflation.csv\" does",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " not exist. \n\nTo plot the average yearly inflation as a",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " time series, you need to provide the actual file path or",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the file itself. If you are using a remote server,",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " you can use the `requests` library to download the file",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " and then load it into a pandas dataframe. \n\nHere",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " is an example of how you can do it:\n\n```\nimport",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " pandas as pd\nimport matplotlib.pyplot as plt\nimport requests\n\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "# Download the csv file\nurl = \"https://example.com",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "/your_file.csv\"\nresponse = requests.get(url)\n\n# Load",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the csv file into a pandas dataframe\ndf = pd.read_csv",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "(response.content)\n\n# Convert 'Year' column to datetime\ndf",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "['Year'] = pd.to_datetime(df['Year'])\n\n# Group",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " by year and calculate average inflation\naverage_inflation = df.groupby",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "('Year')['Inflation'].mean().reset_index()\n\n# Plot",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " average yearly inflation as a time series\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "plt.figure(figsize=(10,6))\nplt.plot(average_in",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "flation['Year'], average_inflation['Inflation'], marker='",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "('Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ")\nplt.show()\n```\n\nPlease replace the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " `url` variable with the actual URL of",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " your csv file. \n\nIf you",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " are using a local file, you can",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " simply use the `pd.read_csv()` function with the file",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " path:\n\n```\nimport pandas as pd\nimport matplotlib.pyplot as",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " plt\n\n# Load the csv file into a pandas dataframe\ndf",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " = pd.read_csv('your_file.csv')\n\n# Convert 'Year",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "' column to datetime\ndf['Year'] = pd.to_datetime",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "(df['Year'])\n\n# Group by",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " year and calculate average inflation\naverage_inflation = df.groupby('",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "Year')['Inflation'].mean().reset_index()\n\n# Plot average",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " yearly inflation as a time series\nplt.figure",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "(figsize=(10,6))\nplt.plot(average_inflation",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "['Year'], average_inflation['Inflation'], marker='o",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "')\nplt.title('Average Yearly Inflation')\nplt.xlabel('",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True)\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "plt.show()\n```\n\nPlease replace `'",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "your_file.csv'` with the actual",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " path to your csv file.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"<TEMP_FILE>\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server, you can use the `requests` library to download the file and then load it into a pandas dataframe. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nimport requests\\n\\n# Download the csv file\\nurl = \"https://example.com/your_file.csv\"\\nresponse = requests.get(url)\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(response.content)\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace the `url` variable with the actual URL of your csv file. 
\\n\\nIf you are using a local file, you can simply use the `pd.read_csv()` function with the file path:\\n\\n```\\nimport pandas as pd\\n\\n# Load the csv file into a pandas dataframe\\ndf = pd.read_csv(\\'your_file.csv\\')\\n\\n# Print the description of the dataframe\\nprint(df.describe())\\n```\\n\\nPlease replace `\\'your_file.csv\\'` with the actual path to your csv file.', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "started"
+            },
+            "tool_call": "",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": " data\ndf = pd.read_csv(\"/var/folders/cz",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "/vyh7y1d11x",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "g881lsxsshnc5c0000gn/T/tmpc",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "_ozqkdv/EzGUQEnJinflation",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": ".csv\")\n\n# Convert 'Year' column",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": " to datetime\ndf['Year'] = pd.to_datetime(df['",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "Year'])\n\n# Group by year and calculate average inflation\naverage_in",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "_index()\n\n# Plot average yearly inflation as a time series\nplt",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": ".figure(figsize=(10,6))\nplt",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": ".plot(average_inflation['Year'], average_inflation['In",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "flation'], marker='o')\nplt.title('Average Yearly Inflation')\n",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "plt.xlabel('Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": ")\nplt.show()",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "succeeded"
+            },
+            "tool_call": {
+              "arguments": {
+                "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/EzGUQEnJinflation.csv\")\n\n# Convert 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation['Year'], average_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation Rate')\nplt.grid(True)\nplt.show()"
+              },
+              "call_id": "7e62f796-c5cd-4021-a651-b0048b75a083",
+              "tool_name": {
+                "__enum__": "BuiltinTool",
+                "value": "code_interpreter"
+              }
+            },
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
   "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"<TEMP_FILE>\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n  line 5, in <module>\\n    from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.code_interpreter: 'code_interpreter'>, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": {
     "chunks": [
       {
@@ -3748,7 +5356,7 @@
       {
         "event": {
           "delta": {
-            "text": "olders/cz/vyh7y1",
+            "text": "olders/cz/vyh7y1d11x",
             "type": "text"
           },
           "event_type": {
@@ -3763,7 +5371,7 @@
       {
         "event": {
           "delta": {
-            "text": "d11xg881lsxsshnc5c0000",
+            "text": "g881lsxsshnc5c000",
             "type": "text"
           },
           "event_type": {
@@ -3778,7 +5386,7 @@
       {
         "event": {
           "delta": {
-            "text": "gn/T/tmpkbnyoruj/lbnHmUP",
+            "text": "0gn/T/tmpc",
             "type": "text"
           },
           "event_type": {
@@ -3793,7 +5401,7 @@
       {
         "event": {
           "delta": {
-            "text": "2inflation.csv\" does not exist. \n\nTo describe",
+            "text": "_ozqkdv/EzGUQEnJinflation",
             "type": "text"
           },
           "event_type": {
@@ -3808,7 +5416,7 @@
       {
         "event": {
           "delta": {
-            "text": " the csv file, you need to provide the actual file",
+            "text": ".csv\" does not exist. \n\nTo",
             "type": "text"
           },
           "event_type": {
@@ -3823,7 +5431,7 @@
       {
         "event": {
           "delta": {
-            "text": " path or the file itself. If you are using a local file",
+            "text": " describe the csv file, you need to provide the actual file",
             "type": "text"
           },
           "event_type": {
@@ -3838,7 +5446,7 @@
       {
         "event": {
           "delta": {
-            "text": ", you can use the `load_data` function from the `",
+            "text": " path or the file itself. If you",
             "type": "text"
           },
           "event_type": {
@@ -3853,7 +5461,7 @@
       {
         "event": {
           "delta": {
-            "text": "code_interpreter` library to load the file. \n\nHere is",
+            "text": " are using a remote server, you can use the `requests` library",
             "type": "text"
           },
           "event_type": {
@@ -3868,7 +5476,7 @@
       {
         "event": {
           "delta": {
-            "text": " an example of how you can do it:\n\n```\nimport pandas",
+            "text": " to download the file and then load it into a pandas dataframe. \n\nHere",
             "type": "text"
           },
           "event_type": {
@@ -3883,7 +5491,7 @@
       {
         "event": {
           "delta": {
-            "text": " as pd\nfrom code_interpreter import load_data\n\n# Load",
+            "text": " is an example of how you can do it:\n\n```\nimport pandas as",
             "type": "text"
           },
           "event_type": {
@@ -3898,7 +5506,7 @@
       {
         "event": {
           "delta": {
-            "text": " data\ndf = load_data('inflation.csv')\n\n# Print",
+            "text": " pd\nimport requests\n\n# Download the csv file\nurl = \"https",
             "type": "text"
           },
           "event_type": {
@@ -3913,7 +5521,7 @@
       {
         "event": {
           "delta": {
-            "text": " summary of the data\nprint(df.head())\nprint(df.info())\n",
+            "text": "://example.com/your_file.csv\"\nresponse = requests.get(url)\n\n#",
             "type": "text"
           },
           "event_type": {
@@ -3928,7 +5536,7 @@
       {
         "event": {
           "delta": {
-            "text": "print(df.describe())\n```\n\nThis will load the csv file and print",
+            "text": " Load the csv file into a pandas dataframe\ndf",
             "type": "text"
           },
           "event_type": {
@@ -3943,7 +5551,7 @@
       {
         "event": {
           "delta": {
-            "text": " the first few rows, a summary of the data, and some descriptive statistics",
+            "text": " = pd.read_csv(response.content)\n\n# Print",
             "type": "text"
           },
           "event_type": {
@@ -3958,7 +5566,7 @@
       {
         "event": {
           "delta": {
-            "text": ". \n\nPlease replace 'inflation.csv' with the actual path to your",
+            "text": " the description of the dataframe\nprint",
             "type": "text"
           },
           "event_type": {
@@ -3973,7 +5581,7 @@
       {
         "event": {
           "delta": {
-            "text": " csv file. \n\nIf you are using a",
+            "text": "(df.describe())\n```\n\nPlease replace the `url`",
             "type": "text"
           },
           "event_type": {
@@ -3988,7 +5596,7 @@
       {
         "event": {
           "delta": {
-            "text": " remote file, you need to provide the actual file path or",
+            "text": " variable with the actual URL of your csv file. \n\nIf",
             "type": "text"
           },
           "event_type": {
@@ -4003,7 +5611,7 @@
       {
         "event": {
           "delta": {
-            "text": " the file itself. \n\nPlease provide the actual file path or the",
+            "text": " you are using a",
             "type": "text"
           },
           "event_type": {
@@ -4018,7 +5626,7 @@
       {
         "event": {
           "delta": {
-            "text": " file itself, and I will be happy to help you describe it",
+            "text": " local file, you can simply use the `pd.read_csv",
             "type": "text"
           },
           "event_type": {
@@ -4033,7 +5641,112 @@
       {
         "event": {
           "delta": {
-            "text": ".",
+            "text": "()` function with the file path:\n\n```\nimport pandas as",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " pd\n\n#",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " Load the csv file into a pandas",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " dataframe\ndf = pd.read_csv('your",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "_file.csv')\n\n# Print the description of",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the dataframe\nprint(df.describe())\n``",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "`\n\nPlease replace `'your_file.csv'` with the actual path",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " to your csv file.",
             "type": "text"
           },
           "event_type": {
@@ -4109,7 +5822,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "import pandas as pd\n# Load",
+            "tool_call": "import pandas as pd\n# Load data\ndf = pd",
             "type": "tool_call"
           },
           "event_type": {
@@ -4128,7 +5841,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " data\ndf = pd.read_csv(\"/",
+            "tool_call": ".read_csv(\"/var",
             "type": "tool_call"
           },
           "event_type": {
@@ -4147,7 +5860,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "var/f",
+            "tool_call": "/folders/cz/vyh7y1d11xg881",
             "type": "tool_call"
           },
           "event_type": {
@@ -4166,7 +5879,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "olders/cz/vyh7y1d11x",
+            "tool_call": "lsxsshnc5c0000gn/T/tmpc_oz",
             "type": "tool_call"
           },
           "event_type": {
@@ -4185,7 +5898,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "g881lsxsshnc5c000",
+            "tool_call": "qkdv/EzGUQEnJinflation.csv\")\n",
             "type": "tool_call"
           },
           "event_type": {
@@ -4204,7 +5917,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "0gn/T/tmpkbnyoruj/l",
+            "tool_call": "# Rows\nprint(\"Number of rows and columns in the data",
             "type": "tool_call"
           },
           "event_type": {
@@ -4223,7 +5936,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "bnHmUP2inflation",
+            "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data",
             "type": "tool_call"
           },
           "event_type": {
@@ -4242,7 +5955,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": ".csv\")\n# Rows\nprint(\"",
+            "tool_call": " are:\", len(df.columns))\n# Column names\n",
             "type": "tool_call"
           },
           "event_type": {
@@ -4261,7 +5974,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "Number of rows and columns in the data",
+            "tool_call": "print(\"Columns of the data are:\", df.columns)\n",
             "type": "tool_call"
           },
           "event_type": {
@@ -4280,7 +5993,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": ":\", df.shape)\n# Columns\n",
+            "tool_call": "# Column dtypes\nprint(\"Datatype of",
             "type": "tool_call"
           },
           "event_type": {
@@ -4299,83 +6012,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "print(\"Columns of the data are:\", len",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": "(df.columns))\n# Column names\nprint(\"",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": "Columns of the data are:\", df.columns)\n# Column dt",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": "ypes\nprint(\"Datatype of the columns are:\", df",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": ".dtypes)",
+            "tool_call": " the columns are:\", df.dtypes)",
             "type": "tool_call"
           },
           "event_type": {
@@ -4396,9 +6033,9 @@
             },
             "tool_call": {
               "arguments": {
-                "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpkbnyoruj/lbnHmUP2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)"
+                "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpc_ozqkdv/EzGUQEnJinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)"
               },
-              "call_id": "c2d44218-eea1-408d-b332-cd82574e2b4e",
+              "call_id": "e57ec9d1-68d8-4493-b3d3-0fb683a4663a",
               "tool_name": {
                 "__enum__": "BuiltinTool",
                 "value": "code_interpreter"
@@ -4439,6 +6076,1745 @@
     ],
     "type": "generator"
   },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:71183\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:98cad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:84988\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:98cad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:84988\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can ask your question now. I will help you answer it using the knowledge_search tool results.', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:98cad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 2:\\nDocument_id:98cad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:84988\\nContent:  with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. 
code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n  LoRA to:\\n\\n  * ``q_proj`` applies LoRA to the query projection layer.\\n  * ``k_proj`` applies LoRA to the key projection layer.\\n  * ``v_proj`` applies LoRA to the value projection layer.\\n  * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\\n  this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n  This is usually a projection to vocabulary space (e.g. in language models), but\\n  other modelling tasks may have different projections - classifier models will project\\n  to the number of classes, for example\\n\\n.. note::\\n\\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n  final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. 
code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:98cad\\nContent:  LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n  # Build Llama2 without any LoRA layers\\n  base_model = llama2_7b()\\n\\n  # The default settings for lora_llama2_7b will match those for llama2_7b\\n  # We just need to define which layers we want LoRA applied to.\\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n  # layers outside of the self-attention.\\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\\n    See :ref:`below<setting_trainable_params>` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n  # Print the first layer\\'s self-attention in the usual Llama2 model\\n  >>> print(base_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n  # Print the same for Llama2 with LoRA weights\\n  >>> print(lora_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n     \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:9c730\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\\n\\n.. 
code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\\\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\\\\n        checkpointer.output_dir=<checkpoint_dir>\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\\n\\n.. note::\\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\\n    that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. 
For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "To",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " use LoRA, you can follow these steps",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ":\n\n1.  Install the necessary packages",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ", including torchtune and the Llama2 model.\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "2.  Load the Llama2 model and specify which",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " layers to apply LoRA to.\n3.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "  Define the LoRA parameters, such as the rank and",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " alpha values.\n4.  Train the model using",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the LoRA fine-tuning recipe in torchtune",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ".\n5.  Use the trained model for inference or further fine",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "-tuning.\n\nHere is an example of how to apply Lo",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "RA to Llama2-7B:\n\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "```python\nfrom torchtune.models.llama2 import",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " llama2_7b, lora_llama2",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "_7b\n\n# Build Llama2 without any Lo",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "RA layers\nbase_model = llama2_7b()\n\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "# The default settings for lora_llama",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "2_7b will match those for",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " llama2_7b\n# We just need to define",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " which layers we want LoRA applied to.\n# Within each",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " self-attention, we can choose from [\"q_proj\",",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " \"k_proj\", \"v_proj\", and \"output_proj\"]",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ".\n# We can also set apply_lora_to_mlp=True",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " or apply_lora_to_output=True to apply LoRA to other",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " linear\n# layers outside of the self-",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "attention.\nlora_model = lora_llama2_7",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "```\n\nYou can also customize the LoRA parameters",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " by specifying the rank and alpha values:\n\n```python",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "\nlora_model = lora_llama2_7b",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "(lora_attn_modules=[\"q_proj\", \"v_proj\"],",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " lora_rank=8, lora_alpha=16)\n``",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "`\n\nTo train the model using the LoRA",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " fine-tuning recipe in torchtune, you can use",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the following command:\n\n```bash\ntune run lora_f",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "inetune_single_device --config llama3/8B_l",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "ora_single_device\n```\n\nThis will",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " load the Llama3-8B-Instruct checkpoint and",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " tokenizer from the specified directory, then save a final checkpoint in the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " same directory following the original format.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:71183\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:98cad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:84988\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:98cad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:84988\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can ask your question now. I will help you answer it using the knowledge_search tool results.', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "{\"",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "type\": \"function\", \"name\": \"knowledge_search",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "\", \"parameters\": {\"query\": \"How to use Lo",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "RA\"}}",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "succeeded"
+            },
+            "tool_call": {
+              "arguments": {
+                "query": "How to use LoRA"
+              },
+              "call_id": "ee82ce77-7143-4b2f-8eb8-de5f31517b84",
+              "tool_name": "knowledge_search"
+            },
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:71183\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:98cad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:84988\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:98cad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:84988\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "You",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " can ask your question now. I will help you answer it using",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the knowledge_search tool results.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:7bdfa\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:64211\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:0c95c\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:64211\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0c95c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:64211\\nContent: .. 
_lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 2:\\nDocument_id:64211\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:0c95c\\nContent:  with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n  LoRA to:\\n\\n  * ``q_proj`` applies LoRA to the query projection layer.\\n  * ``k_proj`` applies LoRA to the key projection layer.\\n  * ``v_proj`` applies LoRA to the value projection layer.\\n  * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\\n  this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n  This is usually a projection to vocabulary space (e.g. 
in language models), but\\n  other modelling tasks may have different projections - classifier models will project\\n  to the number of classes, for example\\n\\n.. note::\\n\\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n  final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:64211\\nContent:  LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n  # Build Llama2 without any LoRA layers\\n  base_model = llama2_7b()\\n\\n  # The default settings for lora_llama2_7b will match those for llama2_7b\\n  # We just need to define which layers we want LoRA applied to.\\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n  # layers outside of the self-attention.\\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
note::\\n\\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\\n    See :ref:`below<setting_trainable_params>` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n  # Print the first layer\\'s self-attention in the usual Llama2 model\\n  >>> print(base_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n  # Print the same for Llama2 with LoRA weights\\n  >>> print(lora_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n     \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:1d70c\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\\n\\n.. 
code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\\\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\\\\n        checkpointer.output_dir=<checkpoint_dir>\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\\n\\n.. note::\\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\\n    that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. 
For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "To",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " use LoRA, you can follow these steps:\n\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "1.  Install the necessary packages",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ", including torchtune and the Llama",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "2 model.\n2.  Load the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " Llama2 model and specify which layers",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " to apply LoRA to.\n3.  Define the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " LoRA parameters, such as the rank",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " and alpha values.\n4.  Train the model using",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the LoRA fine-tuning recipe in torchtune.\n\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "Here is an example of how to use Lo",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "RA with the Llama2 model:\n\n```python\nfrom",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " torchtune.models.llama2 import",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " llama2_7b, lora_llama2_7",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "b\n\n# Build Llama2 without any LoRA layers\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "base_model = llama2_7b()\n\n# The default settings",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " for lora_llama2_7b will match those",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " for llama2_7b\n#",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " We just need to define which layers we",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " want LoRA applied to.\n# Within each self-attention",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ", we can choose from [\"q_proj\", \"k_proj",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "\", \"v_proj\", and \"output_proj\"].\n#",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " We can also set apply_lora_to_mlp=True or",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " apply_lora_to_output=True to apply LoRA to other",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " linear\n# layers outside of the self-attention.\nl",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "ora_model = lora_llama",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "2_7b(lora_attn",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "_modules=[\"q_proj\", \"v_proj\"])\n\n# Print the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " first layer's self-attention in the usual Llama2",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " model\nprint(base_model.layers[0",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "].attn)\n# Print the same for Llama2 with",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " LoRA weights\nprint(lora_model.layers[0].",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "attn)\n```\n\nThis code will load the Llama",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "2 model and apply LoRA to the",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " specified layers. You can then train the model using the Lo",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "RA fine-tuning recipe in torchtune",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": ".\n\nNote that you will need to modify the code to suit",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " your specific use case and requirements. Additionally,",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " you may need to adjust the LoRA parameters and the training",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " settings to achieve the desired results.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:7bdfa\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:64211\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:0c95c\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:64211\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0c95c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "{\"",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "type\": \"function\", \"name\": \"knowledge_search\", \"",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "parameters\": {\"query\": \"How to use LoRA\"}}",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "succeeded"
+            },
+            "tool_call": {
+              "arguments": {
+                "query": "How to use LoRA"
+              },
+              "call_id": "ce86a63d-964a-49a0-8488-29c28ecb2f80",
+              "tool_name": "knowledge_search"
+            },
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:7bdfa\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:64211\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:0c95c\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:64211\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0c95c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "You",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " can use the following function call to answer",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " the user's question:\n\n{\"type\": \"function\", \"",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "name\": \"knowledge_search\", \"parameters\": {\"query\":",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " \"How to fine-tune a L",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "lama2 model with LoRA in torch",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "tune\"}}",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
   "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. 
code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:606ad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 3:\\nDocument_id:e37c3\\nContent: ` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:606ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. 
note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:606ad\\nContent: .. 
_lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\"), TextContentItem(type='text', text='Result 2:\\nDocument_id:606ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n    or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n    for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n    and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n  # Model Arguments\\n  model:\\n    _component_: lora_llama2_7b\\n    lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n    lora_rank: 8\\n    lora_alpha: 16\\n  ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:e37c3\\nContent:  with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n  LoRA to:\\n\\n  * ``q_proj`` applies LoRA to the query projection layer.\\n  * ``k_proj`` applies LoRA to the key projection layer.\\n  * ``v_proj`` applies LoRA to the value projection layer.\\n  * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\\n  this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n  This is usually a projection to vocabulary space (e.g. 
in language models), but\\n  other modelling tasks may have different projections - classifier models will project\\n  to the number of classes, for example\\n\\n.. note::\\n\\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n  final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.llama3.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:606ad\\nContent:  LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n  # Build Llama2 without any LoRA layers\\n  base_model = llama2_7b()\\n\\n  # The default settings for lora_llama2_7b will match those for llama2_7b\\n  # We just need to define which layers we want LoRA applied to.\\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n  # layers outside of the self-attention.\\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
note::\\n\\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\\n    See :ref:`below<setting_trainable_params>` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n  # Print the first layer\\'s self-attention in the usual Llama2 model\\n  >>> print(base_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n    (pos_embeddings): RotaryPositionalEmbeddings()\\n  )\\n\\n  # Print the same for Llama2 with LoRA weights\\n  >>> print(lora_model.layers[0].attn)\\n  MultiHeadAttention(\\n    (q_proj): LoRALinear(\\n      (dropout): Dropout(p=0.0, inplace=False)\\n     \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0b7ba\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\\n\\n.. 
code-block:: bash\\n\\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\\\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\\\\n        checkpointer.output_dir=<checkpoint_dir>\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\\n\\n.. note::\\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\\n    that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. 
For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": {
     "chunks": [
       {
@@ -6446,7 +9822,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"name",
+            "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search",
             "type": "tool_call"
           },
           "event_type": {
@@ -6465,7 +9841,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "\": \"knowledge_search\", \"parameters\": {\"query\": \"Tor",
+            "tool_call": "\", \"parameters\": {\"query\": \"",
             "type": "tool_call"
           },
           "event_type": {
@@ -6484,7 +9860,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "chtune documentation\"}}",
+            "tool_call": "Torchtune documentation\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -6507,7 +9883,7 @@
               "arguments": {
                 "query": "Torchtune documentation"
               },
-              "call_id": "96e0974a-8831-4440-af01-9d42c2a46306",
+              "call_id": "6ec2bf0f-42f3-453d-ad5f-52bc6e0267b7",
               "tool_name": "knowledge_search"
             },
             "type": "tool_call"
@@ -6580,7 +9956,7 @@
       {
         "event": {
           "delta": {
-            "text": "lama3-8B uses grouped-query attention",
+            "text": "lama3-8B uses grouped-query attention instead of the standard multi-head",
             "type": "text"
           },
           "event_type": {
@@ -6595,22 +9971,7 @@
       {
         "event": {
           "delta": {
-            "text": " instead of the standard multi-head attention from Llama2-7",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": "B.",
+            "text": " attention from Llama2-7B.",
             "type": "text"
           },
           "event_type": {
@@ -6678,22 +10039,7 @@
       {
         "event": {
           "delta": {
-            "text": " attention type used by Llama3-8B is grouped-query",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": " attention.",
+            "text": " attention type used by Llama3-8B is grouped-query attention.",
             "type": "text"
           },
           "event_type": {
@@ -6761,7 +10107,7 @@
       {
         "event": {
           "delta": {
-            "text": "    \"type\": \"function\",\n    \"name\": \"knowledge",
+            "text": "    \"type\": \"function\",\n   ",
             "type": "text"
           },
           "event_type": {
@@ -6776,7 +10122,7 @@
       {
         "event": {
           "delta": {
-            "text": "_search\",\n    \"parameters\": {\n        \"query\": \"L",
+            "text": " \"name\": \"knowledge_search\",\n    \"parameters\": {\n        \"",
             "type": "text"
           },
           "event_type": {
@@ -6791,7 +10137,37 @@
       {
         "event": {
           "delta": {
-            "text": "lama3-8B attention type\"\n    }\n}",
+            "text": "query\": \"Llama3",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "-8B attention type\"\n    }\n",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "}",
             "type": "text"
           },
           "event_type": {
@@ -6814,7 +10190,7 @@
               "arguments": {
                 "query": "Llama3-8B attention type"
               },
-              "call_id": "8c86f3e4-1312-4857-8baa-91e23bfd33a4",
+              "call_id": "95471ab3-196c-45ba-a7f1-7585026662c2",
               "tool_name": "knowledge_search"
             },
             "type": "tool_call"
@@ -6895,7 +10271,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",",
+            "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"",
             "type": "tool_call"
           },
           "event_type": {
@@ -6914,26 +10290,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " \"parameters\": {\"query\": \"Llama3-8B",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": " attention type\"}}",
+            "tool_call": "parameters\": {\"query\": \"Llama3-8B attention type\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -6956,7 +10313,7 @@
               "arguments": {
                 "query": "Llama3-8B attention type"
               },
-              "call_id": "652117f8-9427-4090-a0c7-c7d03f94ea74",
+              "call_id": "f026154f-72fb-47aa-828c-065bd5a16267",
               "tool_name": "knowledge_search"
             },
             "type": "tool_call"
@@ -6994,6 +10351,74 @@
     ],
     "type": "generator"
   },
+  "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.brave_search: 'brave_search'>, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.brave_search: 'brave_search'>, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta 
Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\"Challah Horse\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. 
### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.brave_search: 'brave_search'>, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": {
+    "chunks": [
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "start"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "The",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " current CEO of Meta is Mark Zuckerberg.",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "complete"
+          },
+          "logprobs": null,
+          "stop_reason": {
+            "__enum__": "StopReason",
+            "value": "end_of_turn"
+          }
+        },
+        "metrics": null
+      }
+    ],
+    "type": "generator"
+  },
   "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None), CompletionMessage(role='assistant', content='', stop_reason=<StopReason.end_of_turn: 'end_of_turn'>, tool_calls=[ToolCall(call_id='<UUID>', tool_name=<BuiltinTool.brave_search: 'brave_search'>, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='<UUID>', tool_name=<BuiltinTool.brave_search: 'brave_search'>, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta 
Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=<ToolChoice.auto: 'auto'>, tool_prompt_format=None, system_message_behavior=<SystemMessageBehavior.append: 'append'>)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=<BuiltinTool.brave_search: 'brave_search'>, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": {
     "chunks": [
       {
@@ -7188,7 +10613,26 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "brave_search.call(query=\"current CEO of Meta\")",
+            "tool_call": "brave_search.call(query=\"current CEO of",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": " Meta\")",
             "type": "tool_call"
           },
           "event_type": {
@@ -7211,7 +10655,7 @@
               "arguments": {
                 "query": "current CEO of Meta"
               },
-              "call_id": "b5a3c852-c152-4397-b01d-cf0b55da1460",
+              "call_id": "b9ee4732-1663-429c-ae7d-186578174556",
               "tool_name": {
                 "__enum__": "BuiltinTool",
                 "value": "brave_search"
@@ -7385,7 +10829,7 @@
       {
         "event": {
           "delta": {
-            "text": " function `get_boiling_point` is not able to find the boiling point",
+            "text": " function `get_boiling_point` is not able to find",
             "type": "text"
           },
           "event_type": {
@@ -7400,7 +10844,7 @@
       {
         "event": {
           "delta": {
-            "text": " of polyjuice as it is a fictional liquid from the Harry Potter series",
+            "text": " the boiling point of polyjuice as it is a fictional",
             "type": "text"
           },
           "event_type": {
@@ -7415,7 +10859,22 @@
       {
         "event": {
           "delta": {
-            "text": ".",
+            "text": " liquid from the Harry Potter series. The",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": " function only works with real-world liquids.",
             "type": "text"
           },
           "event_type": {
@@ -7626,7 +11085,7 @@
       {
         "event": {
           "delta": {
-            "text": " able to find the boiling",
+            "text": " able to find the boiling point of polyjuice as it is",
             "type": "text"
           },
           "event_type": {
@@ -7641,7 +11100,7 @@
       {
         "event": {
           "delta": {
-            "text": " point of polyjuice as it is not a",
+            "text": " not a real liquid. Polyjuice is a magical potion from",
             "type": "text"
           },
           "event_type": {
@@ -7656,22 +11115,7 @@
       {
         "event": {
           "delta": {
-            "text": " real liquid. Polyjuice is a magical potion from the",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": " Harry Potter series.",
+            "text": " the Harry Potter series.",
             "type": "text"
           },
           "event_type": {
@@ -7867,7 +11311,7 @@
       {
         "event": {
           "delta": {
-            "text": " not able to find the boiling point of polyjuice as",
+            "text": " not able to find the boiling point of polyjuice as it",
             "type": "text"
           },
           "event_type": {
@@ -7882,7 +11326,7 @@
       {
         "event": {
           "delta": {
-            "text": " it is not a real liquid. Polyjuice is a magical potion",
+            "text": " is not a real liquid. Polyjuice is",
             "type": "text"
           },
           "event_type": {
@@ -7897,7 +11341,7 @@
       {
         "event": {
           "delta": {
-            "text": " from the Harry Potter series.",
+            "text": " a magical potion from the Harry Potter series.",
             "type": "text"
           },
           "event_type": {
@@ -8115,7 +11559,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"name\": \"get",
+            "tool_call": "{\"type\": \"function\", \"name\":",
             "type": "tool_call"
           },
           "event_type": {
@@ -8134,7 +11578,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name\": \"poly",
+            "tool_call": " \"get_boiling_point\", \"parameters\":",
             "type": "tool_call"
           },
           "event_type": {
@@ -8153,7 +11597,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "juice\"}}",
+            "tool_call": " {\"liquid_name\": \"polyjuice\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -8176,7 +11620,7 @@
               "arguments": {
                 "liquid_name": "polyjuice"
               },
-              "call_id": "c6384f37-a43d-4ead-a7d5-a9705c32551f",
+              "call_id": "a994859b-38d2-45d5-913e-359409ee8ae2",
               "tool_name": "get_boiling_point"
             },
             "type": "tool_call"
@@ -8399,7 +11843,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"name\": \"get_bo",
+            "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling",
             "type": "tool_call"
           },
           "event_type": {
@@ -8418,7 +11862,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "iling_point\", \"parameters\": {\"liquid_name\": \"polyju",
+            "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice",
             "type": "tool_call"
           },
           "event_type": {
@@ -8437,7 +11881,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "ice\"}}",
+            "tool_call": "\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -8460,7 +11904,7 @@
               "arguments": {
                 "liquid_name": "polyjuice"
               },
-              "call_id": "386d264e-6a42-45dd-8b74-669dbb086014",
+              "call_id": "e48d4312-1a88-4759-9b9c-bc573c23fee6",
               "tool_name": "get_boiling_point"
             },
             "type": "tool_call"
@@ -8676,7 +12120,7 @@
       {
         "event": {
           "delta": {
-            "text": " couldn't find any information on the boiling",
+            "text": " couldn't find any information on the boiling point of Poly",
             "type": "text"
           },
           "event_type": {
@@ -8691,7 +12135,7 @@
       {
         "event": {
           "delta": {
-            "text": " point of Polyjuice. Polyjuice is a magical",
+            "text": "juice. Polyjuice is a magical potion in",
             "type": "text"
           },
           "event_type": {
@@ -8706,7 +12150,7 @@
       {
         "event": {
           "delta": {
-            "text": " potion in the Harry Potter series that allows the drinker to",
+            "text": " the Harry Potter series that allows the drinker",
             "type": "text"
           },
           "event_type": {
@@ -8721,7 +12165,7 @@
       {
         "event": {
           "delta": {
-            "text": " transform into someone else. It's not a physical substance with a",
+            "text": " to transform into someone else. It's not a physical substance",
             "type": "text"
           },
           "event_type": {
@@ -8736,7 +12180,7 @@
       {
         "event": {
           "delta": {
-            "text": " boiling point. If you have any other questions, I'd be",
+            "text": " with a boiling point. If you have any other questions, I'd",
             "type": "text"
           },
           "event_type": {
@@ -8751,7 +12195,7 @@
       {
         "event": {
           "delta": {
-            "text": " happy to help.",
+            "text": " be happy to help.",
             "type": "text"
           },
           "event_type": {
@@ -8969,7 +12413,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"name\": \"get",
+            "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",",
             "type": "tool_call"
           },
           "event_type": {
@@ -8988,26 +12432,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": "\": \"polyjuice\"}}",
+            "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -9030,7 +12455,7 @@
               "arguments": {
                 "liquid_name": "polyjuice"
               },
-              "call_id": "71d947ad-d4d6-4a15-8ec5-d9bf890ed45c",
+              "call_id": "cd0e926b-b1c8-468b-8c55-b3e42e7ae89d",
               "tool_name": "get_boiling_point"
             },
             "type": "tool_call"
@@ -9103,7 +12528,7 @@
       {
         "event": {
           "delta": {
-            "text": " 100th prime number is",
+            "text": " 100th prime number is ",
             "type": "text"
           },
           "event_type": {
@@ -9118,7 +12543,7 @@
       {
         "event": {
           "delta": {
-            "text": " 541.",
+            "text": "541.",
             "type": "text"
           },
           "event_type": {
@@ -9232,7 +12657,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "\n    if n % 2 == 0 or n % 3",
+            "tool_call": "\n    if n % 2 ==",
             "type": "tool_call"
           },
           "event_type": {
@@ -9251,7 +12676,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " == 0:\n        return False\n    i = 5\n    while",
+            "tool_call": " 0 or n % 3 == 0:\n       ",
             "type": "tool_call"
           },
           "event_type": {
@@ -9270,7 +12695,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " i * i <= n:\n        if n %",
+            "tool_call": " return False\n    i = 5\n",
             "type": "tool_call"
           },
           "event_type": {
@@ -9289,7 +12714,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " i == 0 or n % (i",
+            "tool_call": "    while i * i <= n:\n        if n % i",
             "type": "tool_call"
           },
           "event_type": {
@@ -9308,7 +12733,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " + 2) == 0:\n            return False\n       ",
+            "tool_call": " == 0 or n % (i + 2) ==",
             "type": "tool_call"
           },
           "event_type": {
@@ -9327,7 +12752,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " i += 6\n    return True\n\ndef get_nth_prime",
+            "tool_call": " 0:\n            return False\n        i += 6\n",
             "type": "tool_call"
           },
           "event_type": {
@@ -9346,7 +12771,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "(n):\n    count = 0\n    num = 2",
+            "tool_call": "    return True\n\ndef get_nth_prime(n):\n    count =",
             "type": "tool_call"
           },
           "event_type": {
@@ -9365,7 +12790,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "\n    while True:\n        if is_prime(num):\n            count += ",
+            "tool_call": " 0\n    num = 2\n    while True:\n",
             "type": "tool_call"
           },
           "event_type": {
@@ -9384,7 +12809,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "1\n            if count == n:\n                return num\n        num +=",
+            "tool_call": "        if is_prime(num):\n            count += 1\n           ",
             "type": "tool_call"
           },
           "event_type": {
@@ -9403,7 +12828,45 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": " 1\n\nprint(get_nth_prime(100))",
+            "tool_call": " if count == n:\n                return num\n        num +=",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": " 1\n\nprint(get_nth_prime(",
+            "type": "tool_call"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "parse_status": {
+              "__enum__": "ToolCallParseStatus",
+              "value": "in_progress"
+            },
+            "tool_call": "100))",
             "type": "tool_call"
           },
           "event_type": {
@@ -9426,7 +12889,7 @@
               "arguments": {
                 "code": "def is_prime(n):\n    if n <= 1:\n        return False\n    if n <= 3:\n        return True\n    if n % 2 == 0 or n % 3 == 0:\n        return False\n    i = 5\n    while i * i <= n:\n        if n % i == 0 or n % (i + 2) == 0:\n            return False\n        i += 6\n    return True\n\ndef get_nth_prime(n):\n    count = 0\n    num = 2\n    while True:\n        if is_prime(num):\n            count += 1\n            if count == n:\n                return num\n        num += 1\n\nprint(get_nth_prime(100))"
               },
-              "call_id": "ee20e420-1f28-44be-b6e1-4672dec916d8",
+              "call_id": "a184cbe8-b941-472d-9254-fda5ed8d770f",
               "tool_name": {
                 "__enum__": "BuiltinTool",
                 "value": "code_interpreter"
@@ -9638,7 +13101,22 @@
       {
         "event": {
           "delta": {
-            "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":",
+            "text": "type\": \"function\", \"name\": \"",
+            "type": "text"
+          },
+          "event_type": {
+            "__enum__": "ChatCompletionResponseEventType",
+            "value": "progress"
+          },
+          "logprobs": null,
+          "stop_reason": null
+        },
+        "metrics": null
+      },
+      {
+        "event": {
+          "delta": {
+            "text": "knowledge_search\", \"parameters\":",
             "type": "text"
           },
           "event_type": {
@@ -9676,7 +13154,7 @@
               "arguments": {
                 "query": "Perplexity company founding date"
               },
-              "call_id": "ad2b7b43-e9b7-41ff-91f8-150f9ae8b213",
+              "call_id": "9ad1f31d-4fb3-40e6-8037-0cc50794d6ce",
               "tool_name": "knowledge_search"
             },
             "type": "tool_call"
@@ -9925,7 +13403,7 @@
               "arguments": {
                 "query": "Perplexity company founding date"
               },
-              "call_id": "d3ccf807-0bd6-47c4-98c0-d3c603b8b3ca",
+              "call_id": "11c1dca5-6754-4ba6-8337-1bb8a538342f",
               "tool_name": "knowledge_search"
             },
             "type": "tool_call"
@@ -10140,7 +13618,7 @@
       {
         "event": {
           "delta": {
-            "text": " NBA was created on August ",
+            "text": " NBA was created on August 3, ",
             "type": "text"
           },
           "event_type": {
@@ -10155,7 +13633,7 @@
       {
         "event": {
           "delta": {
-            "text": "3, 1949, with",
+            "text": "1949, with the merger of the Basketball Association of America",
             "type": "text"
           },
           "event_type": {
@@ -10170,37 +13648,7 @@
       {
         "event": {
           "delta": {
-            "text": " the merger of the Basketball Association of",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": " America (BAA) and the National Basketball League",
-            "type": "text"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "text": " (NBL).",
+            "text": " (BAA) and the National Basketball League (NBL).",
             "type": "text"
           },
           "event_type": {
@@ -10389,7 +13837,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "{\"type\": \"function\", \"",
+            "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":",
             "type": "tool_call"
           },
           "event_type": {
@@ -10408,45 +13856,7 @@
               "__enum__": "ToolCallParseStatus",
               "value": "in_progress"
             },
-            "tool_call": "name\": \"knowledge_search\", \"parameters",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": "\": {\"query\": \"NBA",
-            "type": "tool_call"
-          },
-          "event_type": {
-            "__enum__": "ChatCompletionResponseEventType",
-            "value": "progress"
-          },
-          "logprobs": null,
-          "stop_reason": null
-        },
-        "metrics": null
-      },
-      {
-        "event": {
-          "delta": {
-            "parse_status": {
-              "__enum__": "ToolCallParseStatus",
-              "value": "in_progress"
-            },
-            "tool_call": " creation date\"}}",
+            "tool_call": " {\"query\": \"NBA creation date\"}}",
             "type": "tool_call"
           },
           "event_type": {
@@ -10469,7 +13879,7 @@
               "arguments": {
                 "query": "NBA creation date"
               },
-              "call_id": "34cb848d-3c9f-4f70-9b1c-8fd4f8455f00",
+              "call_id": "9ffcb7be-c9ba-478a-af1c-8f68d4033c4f",
               "tool_name": "knowledge_search"
             },
             "type": "tool_call"
diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.pickle b/tests/integration/fixtures/recorded_responses/chat_completion.pickle
index aef1aa45d..4abc0c17e 100644
Binary files a/tests/integration/fixtures/recorded_responses/chat_completion.pickle and b/tests/integration/fixtures/recorded_responses/chat_completion.pickle differ
diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json
index 2dde8c83c..7d56a829a 100644
--- a/tests/integration/fixtures/recorded_responses/invoke_tool.json
+++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json
@@ -71,6 +71,15 @@
       "metadata": null
     }
   },
+  "()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Convert \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation[\\'Year\\'], average_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation Rate\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": {
+    "type": "value",
+    "value": {
+      "content": "completed\n[stderr]\nTraceback (most recent call last):\n  line 5, in <module>\n    from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]",
+      "error_code": null,
+      "error_message": null,
+      "metadata": null
+    }
+  },
   "()_[('kwargs', {'session_id': '<UUID>', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"<TEMP_FILE>\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": {
     "type": "value",
     "value": {
@@ -98,23 +107,23 @@
           "type": "text"
         },
         {
-          "text": "Result 1:\nDocument_id:cbc88\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
+          "text": "Result 1:\nDocument_id:64211\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
           "type": "text"
         },
         {
-          "text": "Result 2:\nDocument_id:cbc88\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
+          "text": "Result 2:\nDocument_id:64211\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
           "type": "text"
         },
         {
-          "text": "Result 3:\nDocument_id:8892b\nContent:  with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n  LoRA to:\n\n  * ``q_proj`` applies LoRA to the query projection layer.\n  * ``k_proj`` applies LoRA to the key projection layer.\n  * ``v_proj`` applies LoRA to the value projection layer.\n  * ``output_proj`` applies LoRA to the attention output projection layer.\n\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\n  this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n  This is usually a projection to vocabulary space (e.g. in language models), but\n  other modelling tasks may have different projections - classifier models will project\n  to the number of classes, for example\n\n.. note::\n\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n  final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\n  model.apply_lora_to_mlp=True \\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. 
code-block:: yaml\n\n  model:\n    _component_: torchtune.models.llama3.lora_llama3_8b\n    apply_lora_to_mlp: True\n    model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n",
+          "text": "Result 3:\nDocument_id:0c95c\nContent:  with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n  LoRA to:\n\n  * ``q_proj`` applies LoRA to the query projection layer.\n  * ``k_proj`` applies LoRA to the key projection layer.\n  * ``v_proj`` applies LoRA to the value projection layer.\n  * ``output_proj`` applies LoRA to the attention output projection layer.\n\n  Whilst adding more layers to be fine-tuned may improve model accuracy,\n  this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n  This is usually a projection to vocabulary space (e.g. in language models), but\n  other modelling tasks may have different projections - classifier models will project\n  to the number of classes, for example\n\n.. note::\n\n  Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n  final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device  \\\n  model.apply_lora_to_mlp=True \\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. 
code-block:: yaml\n\n  model:\n    _component_: torchtune.models.llama3.lora_llama3_8b\n    apply_lora_to_mlp: True\n    model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n",
           "type": "text"
         },
         {
-          "text": "Result 4:\nDocument_id:cbc88\nContent:  LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n  # Build Llama2 without any LoRA layers\n  base_model = llama2_7b()\n\n  # The default settings for lora_llama2_7b will match those for llama2_7b\n  # We just need to define which layers we want LoRA applied to.\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n  # layers outside of the self-attention.\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n    See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n  # Print the first layer's self-attention in the usual Llama2 model\n  >>> print(base_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (pos_embeddings): RotaryPositionalEmbeddings()\n  )\n\n  # Print the same for Llama2 with LoRA weights\n  >>> print(lora_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): LoRALinear(\n      (dropout): Dropout(p=0.0, inplace=False)\n     \n",
+          "text": "Result 4:\nDocument_id:64211\nContent:  LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n  from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n  # Build Llama2 without any LoRA layers\n  base_model = llama2_7b()\n\n  # The default settings for lora_llama2_7b will match those for llama2_7b\n  # We just need to define which layers we want LoRA applied to.\n  # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n  # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n  # layers outside of the self-attention.\n  lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n    Calling :func:`lora_llama_2_7b <torchtune.models.llama2.lora_llama2_7b>` alone will not handle the definition of which parameters are trainable.\n    See :ref:`below<setting_trainable_params>` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n  # Print the first layer's self-attention in the usual Llama2 model\n  >>> print(base_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n    (pos_embeddings): RotaryPositionalEmbeddings()\n  )\n\n  # Print the same for Llama2 with LoRA weights\n  >>> print(lora_model.layers[0].attn)\n  MultiHeadAttention(\n    (q_proj): LoRALinear(\n      (dropout): Dropout(p=0.0, inplace=False)\n     \n",
           "type": "text"
         },
         {
-          "text": "Result 5:\nDocument_id:9dcb7\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\n\n.. code-block:: bash\n\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\n        checkpointer.output_dir=<checkpoint_dir>\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\n\n.. note::\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\n    that cannot be achieved by directly modifying existing configurable parameters. 
For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n",
+          "text": "Result 5:\nDocument_id:1d70c\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial <qlora_finetune_label>`.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n    To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides <cli_override>` as needed, e.g.\n\n.. code-block:: bash\n\n    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n        checkpointer.checkpoint_dir=<checkpoint_dir> \\\n        tokenizer.path=<checkpoint_dir>/tokenizer.model \\\n        checkpointer.output_dir=<checkpoint_dir>\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ``<checkpoint_dir>`` used in the :ref:`tune download <tune_download_label>` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive <understand_checkpointer>`.\n\n.. note::\n    To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp <tune_cp_cli_label>` to copy (and modify)\n    the default config. :ref:`tune cp <tune_cp_cli_label>` can be used with recipe scripts too, in case you want to make more custom changes\n    that cannot be achieved by directly modifying existing configurable parameters. 
For more on :ref:`tune cp <tune_cp_cli_label>` see the section on\n    :ref:`modifying configs <tune_cp_label>` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n",
           "type": "text"
         },
         {
@@ -126,11 +135,11 @@
       "error_message": null,
       "metadata": {
         "document_ids": [
-          "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c",
-          "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c",
-          "8892b092-6394-471e-b143-a23c6cc374f8",
-          "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c",
-          "9dcb747d-0627-40cc-a23c-0bee2b6b05af"
+          "6421150d-d334-4163-a058-3818b2b742e9",
+          "6421150d-d334-4163-a058-3818b2b742e9",
+          "0c95cff3-5612-40cf-a73d-77644a2462d0",
+          "6421150d-d334-4163-a058-3818b2b742e9",
+          "1d70c86d-4cdf-4be9-a1f2-8a271b15ce2c"
         ]
       }
     }
@@ -298,23 +307,23 @@
           "type": "text"
         },
         {
-          "text": "Result 1:\nDocument_id:3e3a0\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n    from torchtune.datasets import chat_dataset\n    from torchtune.models.llama3 import llama3_tokenizer\n\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n    ds = chat_dataset(\n        tokenizer=tokenizer,\n        source=\"json\",\n        data_files=\"data/my_data.json\",\n        split=\"train\",\n        conversation_column=\"dialogue\",\n        conversation_style=\"sharegpt\",\n    )\n\n.. code-block:: yaml\n\n    # In config\n    tokenizer:\n      _component_: torchtune.models.llama3.llama3_tokenizer\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n    dataset:\n      _component_: torchtune.datasets.chat_dataset\n      source: json\n      data_files: data/my_data.json\n      split: train\n      conversation_column: dialogue\n      conversation_style: sharegpt\n\n.. note::\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\n    Dataset classes and they will honor them. 
This is useful for common parameters\n    such as specifying the data split with :code:`split` or configuration with\n    :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations <https://\n",
+          "text": "Result 1:\nDocument_id:7bdfa\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n    from torchtune.datasets import chat_dataset\n    from torchtune.models.llama3 import llama3_tokenizer\n\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n    ds = chat_dataset(\n        tokenizer=tokenizer,\n        source=\"json\",\n        data_files=\"data/my_data.json\",\n        split=\"train\",\n        conversation_column=\"dialogue\",\n        conversation_style=\"sharegpt\",\n    )\n\n.. code-block:: yaml\n\n    # In config\n    tokenizer:\n      _component_: torchtune.models.llama3.llama3_tokenizer\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n    dataset:\n      _component_: torchtune.datasets.chat_dataset\n      source: json\n      data_files: data/my_data.json\n      split: train\n      conversation_column: dialogue\n      conversation_style: sharegpt\n\n.. note::\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\n    Dataset classes and they will honor them. 
This is useful for common parameters\n    such as specifying the data split with :code:`split` or configuration with\n    :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations <https://\n",
           "type": "text"
         },
         {
-          "text": "Result 2:\nDocument_id:7da0c\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
+          "text": "Result 2:\nDocument_id:64211\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\n\n.. grid:: 2\n\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n      * What LoRA is and how it saves memory during finetuning\n      * An overview of LoRA components in torchtune\n      * How to run a LoRA finetune using torchtune\n      * How to experiment with different LoRA configurations\n\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n      * Be familiar with :ref:`torchtune<overview_label>`\n      * Make sure to :ref:`install torchtune<install_label>`\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\n\nWhat is LoRA?\n-------------\n\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW <https://py\n",
           "type": "text"
         },
         {
-          "text": "Result 3:\nDocument_id:fd0f6\nContent: ` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
+          "text": "Result 3:\nDocument_id:0c95c\nContent: ` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
           "type": "text"
         },
         {
-          "text": "Result 4:\nDocument_id:7da0c\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
+          "text": "Result 4:\nDocument_id:64211\nContent: 06% of all params are trainable.\n\n.. note::\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n    of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92c749411fbd5f0ff140023a25/recipes/lora_finetune.py>`_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions<download_llama_label>`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n    tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n    Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n    either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n    or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n    for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n    You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n    and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n  # Model Arguments\n  model:\n    _component_: lora_llama2_7b\n    lora_attn_modules: ['q_proj', 'v_proj']\n    lora_rank: 8\n    lora_alpha: 16\n  ...\n\nWe see that the\n",
           "type": "text"
         },
         {
-          "text": "Result 5:\nDocument_id:fd0f6\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.use_dora=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.apply_lora_to_mlp=True \\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n  model.lora_rank=16 \\\n  model.lora_alpha=32 \\\n  model.use_dora=True \\\n  model.quantize_base=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    apply_lora_to_mlp: True\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n    lora_rank: 16\n    lora_alpha: 32\n    use_dora: True\n    quantize_base: True\n\n\n.. note::\n\n   Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
+          "text": "Result 5:\nDocument_id:0c95c\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.use_dora=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n  model.apply_lora_to_mlp=True \\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n  model.lora_rank=16 \\\n  model.lora_alpha=32 \\\n  model.use_dora=True \\\n  model.quantize_base=True\n\n.. code-block:: yaml\n\n  model:\n    _component_: torchtune.models.lora_llama3_8b\n    apply_lora_to_mlp: True\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n    lora_rank: 16\n    lora_alpha: 32\n    use_dora: True\n    quantize_base: True\n\n\n.. note::\n\n   Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\n.. .. _glossary_fsdp2:\n\n",
           "type": "text"
         },
         {
@@ -326,11 +335,11 @@
       "error_message": null,
       "metadata": {
         "document_ids": [
-          "3e3a05a7-23d4-461e-a304-8aa7cb35a4f5",
-          "7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f",
-          "fd0f6ee9-15d2-43b3-8500-25bc5bdfd365",
-          "7da0c755-7ffa-4c1a-9ab0-cfdda7cce00f",
-          "fd0f6ee9-15d2-43b3-8500-25bc5bdfd365"
+          "7bdfad34-d546-4e98-9757-a0289696cd97",
+          "6421150d-d334-4163-a058-3818b2b742e9",
+          "0c95cff3-5612-40cf-a73d-77644a2462d0",
+          "6421150d-d334-4163-a058-3818b2b742e9",
+          "0c95cff3-5612-40cf-a73d-77644a2462d0"
         ]
       }
     }
diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.pickle b/tests/integration/fixtures/recorded_responses/invoke_tool.pickle
index a642db721..bb7fb29d7 100644
Binary files a/tests/integration/fixtures/recorded_responses/invoke_tool.pickle and b/tests/integration/fixtures/recorded_responses/invoke_tool.pickle differ