From 0fdb15bcc74f236cd4e6ac4291a361a08b6bf1b3 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 12 Mar 2025 13:26:23 -0700 Subject: [PATCH 01/38] fix: fix build error in context.py (#1595) # What does this PR do? This fixes the build error ## Test Plan pre-commit run --all-files check for merge conflicts................................................Passed trim trailing whitespace.................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed ruff.....................................................................Passed ruff-format..............................................................Passed blacken-docs.............................................................Passed uv-lock..................................................................Passed uv-export................................................................Passed mypy.....................................................................Passed Distribution Template Codegen............................................Passed --- llama_stack/distribution/utils/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/utils/context.py b/llama_stack/distribution/utils/context.py index 107ce7127..2f32afba2 100644 --- a/llama_stack/distribution/utils/context.py +++ b/llama_stack/distribution/utils/context.py @@ -19,7 +19,7 @@ def preserve_contexts_async_generator( and we need to preserve the context across the event loop boundary. """ - async def wrapper(): + async def wrapper() -> AsyncGenerator[T, None]: while True: try: item = await gen.__anext__() From 1311faf3f5e7e18111b642be6bbdd941c2034e02 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 14:57:31 -0700 Subject: [PATCH 02/38] fix: logging (#1598) Summary: Test Plan: --- llama_stack/distribution/routers/routers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 68b8e55cb..34102d04b 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -238,7 +238,6 @@ class InferenceRouter(Inference): tool_config: Optional[ToolConfig] = None, ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]: logger.debug( - "core", f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}", ) if sampling_params is None: From ad939c97c37f8e33d0e94fe43893773cffe7618e Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 18:41:35 -0400 Subject: [PATCH 03/38] docs: add unit test badge to README (#1591) # What does this PR do? 
This PR adds a simple unit test badge to the project README It also modifies the workflow to run on merges to main, so that the status reflected in the README is that of main and not pull request branches --------- Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 2 ++ README.md | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 39505ba11..59d18b3be 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,6 +1,8 @@ name: Unit Tests on: + push: + branches: [ main ] pull_request: branches: [ main ] workflow_dispatch: diff --git a/README.md b/README.md index b24e69514..6e1fd088e 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) [![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack) +![Unit](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main) [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) From 99bbe0e70b125f93da659ca722a9d5c2f6ef7022 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 12 Mar 2025 15:45:44 -0700 Subject: [PATCH 04/38] feat: Add new compact MetricInResponse type (#1593) # What does this PR do? This change adds a compact type to include metrics in response as opposed to the full MetricEvent which is relevant for internal logging purposes. ## Test Plan ``` LLAMA_STACK_CONFIG=~/.llama/distributions/fireworks/fireworks-run.yaml pytest -s -v agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml curl --request POST \ --url http://localhost:8321/v1/inference/chat-completion \ --header 'content-type: application/json' \ --data '{ "model_id": "meta-llama/Llama-3.1-70B-Instruct", "messages": [ { "role": "user", "content": { "type": "text", "text": "where do humans live" } } ], "stream": false }' { "metrics": [ { "metric": "prompt_tokens", "value": 10, "unit": null }, { "metric": "completion_tokens", "value": 522, "unit": null }, { "metric": "total_tokens", "value": 532, "unit": null } ], "completion_message": { "role": "assistant", "content": "Humans live in various parts of the world...............", "stop_reason": "out_of_tokens", "tool_calls": [] }, "logprobs": null } ``` --- docs/_static/llama-stack-spec.html | 133 +++++++++++++------- docs/_static/llama-stack-spec.yaml | 82 +++++++----- llama_stack/apis/telemetry/telemetry.py | 9 +- llama_stack/distribution/routers/routers.py | 6 +- 4 files changed, 150 insertions(+), 80 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 709360ede..dbd530aa3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4549,7 +4549,7 @@ "metrics": { "type": "array", "items": { - "$ref": "#/components/schemas/MetricEvent" + "$ref": "#/components/schemas/MetricInResponse" } }, "completion_message": { @@ -4571,46 +4571,9 @@ "title": "ChatCompletionResponse", "description": "Response from a chat completion request." 
}, - "MetricEvent": { + "MetricInResponse": { "type": "object", "properties": { - "trace_id": { - "type": "string" - }, - "span_id": { - "type": "string" - }, - "timestamp": { - "type": "string", - "format": "date-time" - }, - "attributes": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } - }, - "type": { - "type": "string", - "const": "metric", - "default": "metric" - }, "metric": { "type": "string" }, @@ -4630,15 +4593,10 @@ }, "additionalProperties": false, "required": [ - "trace_id", - "span_id", - "timestamp", - "type", "metric", - "value", - "unit" + "value" ], - "title": "MetricEvent" + "title": "MetricInResponse" }, "TokenLogProbs": { "type": "object", @@ -4715,6 +4673,12 @@ "CompletionResponse": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricInResponse" + } + }, "content": { "type": "string", "description": "The generated completion text" @@ -4924,7 +4888,7 @@ "metrics": { "type": "array", "items": { - "$ref": "#/components/schemas/MetricEvent" + "$ref": "#/components/schemas/MetricInResponse" } }, "event": { @@ -5082,6 +5046,12 @@ "CompletionResponseStreamChunk": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricInResponse" + } + }, "delta": { "type": "string", "description": "New content generated since last chunk. This can be one or more tokens." @@ -8363,6 +8333,75 @@ ], "title": "LogSeverity" }, + "MetricEvent": { + "type": "object", + "properties": { + "trace_id": { + "type": "string" + }, + "span_id": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + "type": { + "type": "string", + "const": "metric", + "default": "metric" + }, + "metric": { + "type": "string" + }, + "value": { + "oneOf": [ + { + "type": "integer" + }, + { + "type": "number" + } + ] + }, + "unit": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "trace_id", + "span_id", + "timestamp", + "type", + "metric", + "value", + "unit" + ], + "title": "MetricEvent" + }, "SpanEndPayload": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 4c00fbe63..cca1872a4 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3101,7 +3101,7 @@ components: metrics: type: array items: - $ref: '#/components/schemas/MetricEvent' + $ref: '#/components/schemas/MetricInResponse' completion_message: $ref: '#/components/schemas/CompletionMessage' description: The complete response message @@ -3116,29 +3116,9 @@ components: - completion_message title: ChatCompletionResponse description: Response from a chat completion request. 
- MetricEvent: + MetricInResponse: type: object properties: - trace_id: - type: string - span_id: - type: string - timestamp: - type: string - format: date-time - attributes: - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - type: - type: string - const: metric - default: metric metric: type: string value: @@ -3149,14 +3129,9 @@ components: type: string additionalProperties: false required: - - trace_id - - span_id - - timestamp - - type - metric - value - - unit - title: MetricEvent + title: MetricInResponse TokenLogProbs: type: object properties: @@ -3213,6 +3188,10 @@ components: CompletionResponse: type: object properties: + metrics: + type: array + items: + $ref: '#/components/schemas/MetricInResponse' content: type: string description: The generated completion text @@ -3412,7 +3391,7 @@ components: metrics: type: array items: - $ref: '#/components/schemas/MetricEvent' + $ref: '#/components/schemas/MetricInResponse' event: $ref: '#/components/schemas/ChatCompletionResponseEvent' description: The event containing the new content @@ -3531,6 +3510,10 @@ components: CompletionResponseStreamChunk: type: object properties: + metrics: + type: array + items: + $ref: '#/components/schemas/MetricInResponse' delta: type: string description: >- @@ -5703,6 +5686,47 @@ components: - error - critical title: LogSeverity + MetricEvent: + type: object + properties: + trace_id: + type: string + span_id: + type: string + timestamp: + type: string + format: date-time + attributes: + type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + type: + type: string + const: metric + default: metric + metric: + type: string + value: + oneOf: + - type: integer + - type: number + unit: + type: string + additionalProperties: false + required: + - trace_id + - span_id + - timestamp + - type + - metric + - value + - unit + title: MetricEvent SpanEndPayload: type: object properties: diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index fe75677e7..cbea57e79 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -96,6 +96,13 @@ class MetricEvent(EventCommon): unit: str +@json_schema_type +class MetricInResponse(BaseModel): + metric: str + value: Union[int, float] + unit: Optional[str] = None + + # This is a short term solution to allow inference API to return metrics # The ideal way to do this is to have a way for all response types to include metrics # and all metric events logged to the telemetry API to be inlcuded with the response @@ -117,7 +124,7 @@ class MetricEvent(EventCommon): class MetricResponseMixin(BaseModel): - metrics: Optional[List[MetricEvent]] = None + metrics: Optional[List[MetricInResponse]] = None @json_schema_type diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 34102d04b..22a1e46f9 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -48,7 +48,7 @@ from llama_stack.apis.scoring import ( ScoringFnParams, ) from llama_stack.apis.shields import Shield -from llama_stack.apis.telemetry import MetricEvent, Telemetry +from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry from llama_stack.apis.tools import ( RAGDocument, RAGQueryConfig, @@ -206,12 +206,12 @@ class InferenceRouter(Inference): completion_tokens: 
int, total_tokens: int, model: Model, - ) -> List[MetricEvent]: + ) -> List[MetricInResponse]: metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) if self.telemetry: for metric in metrics: await self.telemetry.log_event(metric) - return metrics + return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] async def _count_tokens( self, From 41c9bca1aa7a44cf2048b6c9371cd7740d2e47c1 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 18:48:03 -0700 Subject: [PATCH 05/38] chore: refactor Agent toolgroup processing (#1381) Summary: Refactoring only. Centralize logic to preprocess toolgroup to one place. Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/api/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1381). * #1384 * __->__ #1381 --- .../agents/meta_reference/agent_instance.py | 120 ++++++++---------- 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 1d9f54e96..1884094df 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -12,7 +12,7 @@ import secrets import string import uuid from datetime import datetime -from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union +from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import httpx @@ -181,6 +181,7 @@ class ChatAgent(ShieldRunnerMixin): return messages async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator: + await self._initialize_tools(request.toolgroups) async with tracing.span("create_and_execute_turn") as span: span.set_attribute("session_id", request.session_id) span.set_attribute("agent_id", self.agent_id) @@ -191,6 +192,7 @@ class ChatAgent(ShieldRunnerMixin): yield chunk async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator: + await self._initialize_tools() async with tracing.span("resume_turn") as span: span.set_attribute("agent_id", self.agent_id) span.set_attribute("session_id", request.session_id) @@ -275,7 +277,6 @@ class ChatAgent(ShieldRunnerMixin): sampling_params=self.agent_config.sampling_params, stream=request.stream, documents=request.documents if not is_resume else None, - toolgroups_for_turn=request.toolgroups if not is_resume else None, ): if isinstance(chunk, CompletionMessage): output_message = chunk @@ -327,7 +328,6 @@ class ChatAgent(ShieldRunnerMixin): sampling_params: SamplingParams, stream: bool = False, documents: Optional[List[Document]] = None, - toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, ) -> AsyncGenerator: # Doing async generators makes downstream code much simpler and everything amenable to # streaming. 
However, it also makes things complicated here because AsyncGenerators cannot @@ -350,7 +350,6 @@ class ChatAgent(ShieldRunnerMixin): sampling_params, stream, documents, - toolgroups_for_turn, ): if isinstance(res, bool): return @@ -451,30 +450,17 @@ class ChatAgent(ShieldRunnerMixin): sampling_params: SamplingParams, stream: bool = False, documents: Optional[List[Document]] = None, - toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, ) -> AsyncGenerator: - # TODO: simplify all of this code, it can be simpler - toolgroup_args = {} - toolgroups = set() - for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []): - if isinstance(toolgroup, AgentToolGroupWithArgs): - tool_group_name, tool_name = self._parse_toolgroup_name(toolgroup.name) - toolgroups.add(tool_group_name) - toolgroup_args[tool_group_name] = toolgroup.args - else: - toolgroups.add(toolgroup) - - tool_defs, tool_to_group = await self._get_tool_defs(toolgroups_for_turn) if documents: - await self.handle_documents(session_id, documents, input_messages, tool_defs) + await self.handle_documents(session_id, documents, input_messages) session_info = await self.storage.get_session_info(session_id) # if the session has a memory bank id, let the memory tool use it if session_info and session_info.vector_db_id: - if RAG_TOOL_GROUP not in toolgroup_args: - toolgroup_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]} + if RAG_TOOL_GROUP not in self.toolgroup_to_args: + self.toolgroup_to_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]} else: - toolgroup_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id) + self.toolgroup_to_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id) output_attachments = [] @@ -504,7 +490,7 @@ class ChatAgent(ShieldRunnerMixin): async for chunk in await self.inference_api.chat_completion( self.agent_config.model, input_messages, - tools=tool_defs, + tools=self.tool_defs, tool_prompt_format=self.agent_config.tool_config.tool_prompt_format, response_format=self.agent_config.response_format, stream=True, @@ -686,12 +672,9 @@ class ChatAgent(ShieldRunnerMixin): ) as span: tool_execution_start_time = datetime.now().astimezone().isoformat() tool_call = message.tool_calls[0] - tool_result = await execute_tool_call_maybe( - self.tool_runtime_api, + tool_result = await self.execute_tool_call_maybe( session_id, tool_call, - toolgroup_args, - tool_to_group, ) if tool_result.content is None: raise ValueError( @@ -744,6 +727,15 @@ class ChatAgent(ShieldRunnerMixin): input_messages = input_messages + [message, result_message] + async def _initialize_tools(self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None): + self.toolgroup_to_args = {} + for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []): + if isinstance(toolgroup, AgentToolGroupWithArgs): + tool_group_name, _ = self._parse_toolgroup_name(toolgroup.name) + self.toolgroup_to_args[tool_group_name] = toolgroup.args + + self.tool_defs, self.tool_name_to_group_id = await self._get_tool_defs(toolgroups_for_turn) + async def _get_tool_defs( self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None ) -> Tuple[List[ToolDefinition], Dict[str, str]]: @@ -756,7 +748,7 @@ class ChatAgent(ShieldRunnerMixin): agent_config_toolgroups.append(name) tool_name_to_def = {} - tool_to_group = {} + tool_name_to_group_id = {} for tool_def in self.agent_config.client_tools: if tool_name_to_def.get(tool_def.name, None): @@ -774,7 +766,7 @@ class 
ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_to_group[tool_def.name] = "__client_tools__" + tool_name_to_group_id[tool_def.name] = "__client_tools__" for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups: toolgroup_name, tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name) tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name) @@ -813,7 +805,7 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_to_group[built_in_type] = tool_def.toolgroup_id + tool_name_to_group_id[built_in_type] = tool_def.toolgroup_id continue if tool_name_to_def.get(tool_def.identifier, None): @@ -832,9 +824,9 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_to_group[tool_def.identifier] = tool_def.toolgroup_id + tool_name_to_group_id[tool_def.identifier] = tool_def.toolgroup_id - return list(tool_name_to_def.values()), tool_to_group + return list(tool_name_to_def.values()), tool_name_to_group_id def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]: """Parse a toolgroup name into its components. @@ -853,15 +845,44 @@ class ChatAgent(ShieldRunnerMixin): tool_group, tool_name = split_names[0], None return tool_group, tool_name + async def execute_tool_call_maybe( + self, + session_id: str, + tool_call: ToolCall, + ) -> ToolInvocationResult: + name = tool_call.tool_name + group_name = self.tool_name_to_group_id.get(name, None) + if group_name is None: + raise ValueError( + f"Tool {name} not found in any tool group, available tools: {', '.join(self.tool_name_to_group_id.keys())}" + ) + if isinstance(name, BuiltinTool): + if name == BuiltinTool.brave_search: + name = WEB_SEARCH_TOOL + else: + name = name.value + + logger.info(f"executing tool call: {name} with args: {tool_call.arguments}") + result = await self.tool_runtime_api.invoke_tool( + tool_name=name, + kwargs={ + "session_id": session_id, + # get the arguments generated by the model and augment with toolgroup arg overrides for the agent + **tool_call.arguments, + **self.toolgroup_to_args.get(group_name, {}), + }, + ) + logger.debug(f"tool call {name} completed with result: {result}") + return result + async def handle_documents( self, session_id: str, documents: List[Document], input_messages: List[Message], - tool_defs: Dict[str, ToolDefinition], ) -> None: - memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in tool_defs) - code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in tool_defs) + memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in self.tool_defs) + code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in self.tool_defs) content_items = [] url_items = [] pattern = re.compile("^(https?://|file://|data:)") @@ -994,37 +1015,6 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa ) -async def execute_tool_call_maybe( - tool_runtime_api: ToolRuntime, - session_id: str, - tool_call: ToolCall, - toolgroup_args: Dict[str, Dict[str, Any]], - tool_to_group: Dict[str, str], -) -> ToolInvocationResult: - name = tool_call.tool_name - group_name = tool_to_group.get(name, None) - if group_name is None: - raise ValueError(f"Tool {name} not found in any tool group") - if isinstance(name, BuiltinTool): - if name == BuiltinTool.brave_search: - name = WEB_SEARCH_TOOL - else: - name = name.value - - logger.info(f"executing tool 
call: {name} with args: {tool_call.arguments}") - result = await tool_runtime_api.invoke_tool( - tool_name=name, - kwargs={ - "session_id": session_id, - # get the arguments generated by the model and augment with toolgroup arg overrides for the agent - **tool_call.arguments, - **toolgroup_args.get(group_name, {}), - }, - ) - logger.info(f"tool call {name} completed with result: {result}") - return result - - def _interpret_content_as_attachment( content: str, ) -> Optional[Attachment]: From ed6caead724aa8b5b1c4e53528e120888517a812 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 18:51:18 -0700 Subject: [PATCH 06/38] chore: simplify _get_tool_defs (#1384) Summary: Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --- .../agents/meta_reference/agent_instance.py | 111 ++++++++---------- 1 file changed, 50 insertions(+), 61 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 1884094df..3f09cacc0 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -12,7 +12,7 @@ import secrets import string import uuid from datetime import datetime -from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union +from typing import AsyncGenerator, List, Optional, Union from urllib.parse import urlparse import httpx @@ -457,10 +457,12 @@ class ChatAgent(ShieldRunnerMixin): session_info = await self.storage.get_session_info(session_id) # if the session has a memory bank id, let the memory tool use it if session_info and session_info.vector_db_id: - if RAG_TOOL_GROUP not in self.toolgroup_to_args: - self.toolgroup_to_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]} - else: - self.toolgroup_to_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id) + for tool_name in self.tool_name_to_args.keys(): + if tool_name == MEMORY_QUERY_TOOL: + if "vector_db_ids" not in self.tool_name_to_args[tool_name]: + self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id] + else: + self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id) output_attachments = [] @@ -727,18 +729,16 @@ class ChatAgent(ShieldRunnerMixin): input_messages = input_messages + [message, result_message] - async def _initialize_tools(self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None): - self.toolgroup_to_args = {} - for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []): + async def _initialize_tools( + self, + toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, + ) -> None: + toolgroup_to_args = {} + for toolgroup in (self.agent_config.toolgroups or []) + (toolgroups_for_turn or []): if isinstance(toolgroup, AgentToolGroupWithArgs): tool_group_name, _ = self._parse_toolgroup_name(toolgroup.name) - self.toolgroup_to_args[tool_group_name] = toolgroup.args + toolgroup_to_args[tool_group_name] = toolgroup.args - self.tool_defs, self.tool_name_to_group_id = await self._get_tool_defs(toolgroups_for_turn) - - async def _get_tool_defs( - self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None - ) -> Tuple[List[ToolDefinition], Dict[str, str]]: # Determine which tools to include tool_groups_to_include = toolgroups_for_turn or self.agent_config.toolgroups 
or [] agent_config_toolgroups = [] @@ -747,8 +747,10 @@ class ChatAgent(ShieldRunnerMixin): if name not in agent_config_toolgroups: agent_config_toolgroups.append(name) + toolgroup_to_args = toolgroup_to_args or {} + tool_name_to_def = {} - tool_name_to_group_id = {} + tool_name_to_args = {} for tool_def in self.agent_config.client_tools: if tool_name_to_def.get(tool_def.name, None): @@ -766,53 +768,38 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_name_to_group_id[tool_def.name] = "__client_tools__" for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups: - toolgroup_name, tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name) + toolgroup_name, input_tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name) tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name) if not tools.data: available_tool_groups = ", ".join( [t.identifier for t in (await self.tool_groups_api.list_tool_groups()).data] ) raise ValueError(f"Toolgroup {toolgroup_name} not found, available toolgroups: {available_tool_groups}") - if tool_name is not None and not any(tool.identifier == tool_name for tool in tools.data): + if input_tool_name is not None and not any(tool.identifier == input_tool_name for tool in tools.data): raise ValueError( - f"Tool {tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}" + f"Tool {input_tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}" ) for tool_def in tools.data: if toolgroup_name.startswith("builtin") and toolgroup_name != RAG_TOOL_GROUP: - tool_name = tool_def.identifier - built_in_type = BuiltinTool.brave_search - if tool_name == "web_search": - built_in_type = BuiltinTool.brave_search + identifier: str | BuiltinTool | None = tool_def.identifier + if identifier == "web_search": + identifier = BuiltinTool.brave_search else: - built_in_type = BuiltinTool(tool_name) + identifier = BuiltinTool(identifier) + else: + # add if tool_name is unspecified or the tool_def identifier is the same as the tool_name + if input_tool_name in (None, tool_def.identifier): + identifier = tool_def.identifier + else: + identifier = None - if tool_name_to_def.get(built_in_type, None): - raise ValueError(f"Tool {built_in_type} already exists") - - tool_name_to_def[built_in_type] = ToolDefinition( - tool_name=built_in_type, - description=tool_def.description, - parameters={ - param.name: ToolParamDefinition( - param_type=param.parameter_type, - description=param.description, - required=param.required, - default=param.default, - ) - for param in tool_def.parameters - }, - ) - tool_name_to_group_id[built_in_type] = tool_def.toolgroup_id - continue - - if tool_name_to_def.get(tool_def.identifier, None): - raise ValueError(f"Tool {tool_def.identifier} already exists") - if tool_name in (None, tool_def.identifier): + if tool_name_to_def.get(identifier, None): + raise ValueError(f"Tool {identifier} already exists") + if identifier: tool_name_to_def[tool_def.identifier] = ToolDefinition( - tool_name=tool_def.identifier, + tool_name=identifier, description=tool_def.description, parameters={ param.name: ToolParamDefinition( @@ -824,9 +811,9 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_name_to_group_id[tool_def.identifier] = tool_def.toolgroup_id + tool_name_to_args[tool_def.identifier] = toolgroup_to_args.get(toolgroup_name, {}) - 
return list(tool_name_to_def.values()), tool_name_to_group_id + self.tool_defs, self.tool_name_to_args = list(tool_name_to_def.values()), tool_name_to_args def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]: """Parse a toolgroup name into its components. @@ -850,29 +837,31 @@ class ChatAgent(ShieldRunnerMixin): session_id: str, tool_call: ToolCall, ) -> ToolInvocationResult: - name = tool_call.tool_name - group_name = self.tool_name_to_group_id.get(name, None) - if group_name is None: + tool_name = tool_call.tool_name + registered_tool_names = [tool_def.tool_name for tool_def in self.tool_defs] + if tool_name not in registered_tool_names: raise ValueError( - f"Tool {name} not found in any tool group, available tools: {', '.join(self.tool_name_to_group_id.keys())}" + f"Tool {tool_name} not found in provided tools, registered tools: {', '.join([str(x) for x in registered_tool_names])}" ) - if isinstance(name, BuiltinTool): - if name == BuiltinTool.brave_search: - name = WEB_SEARCH_TOOL + if isinstance(tool_name, BuiltinTool): + if tool_name == BuiltinTool.brave_search: + tool_name_str = WEB_SEARCH_TOOL else: - name = name.value + tool_name_str = tool_name.value + else: + tool_name_str = tool_name - logger.info(f"executing tool call: {name} with args: {tool_call.arguments}") + logger.info(f"executing tool call: {tool_name_str} with args: {tool_call.arguments}") result = await self.tool_runtime_api.invoke_tool( - tool_name=name, + tool_name=tool_name_str, kwargs={ "session_id": session_id, # get the arguments generated by the model and augment with toolgroup arg overrides for the agent **tool_call.arguments, - **self.toolgroup_to_args.get(group_name, {}), + **self.tool_name_to_args.get(tool_name_str, {}), }, ) - logger.debug(f"tool call {name} completed with result: {result}") + logger.debug(f"tool call {tool_name_str} completed with result: {result}") return result async def handle_documents( From 2baf200b63e2677ec991adc5e77c19d46477d838 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 22:05:49 -0400 Subject: [PATCH 07/38] ci: add html report to unit test artifacts (#1576) # What does this PR do? additional artifacts make test results more human-readable ## Test Plan Ran locally Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 59d18b3be..517b5c39a 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -33,7 +33,7 @@ jobs: - name: Run unit tests run: | - uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml + uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} - name: Upload test results if: always() @@ -43,4 +43,5 @@ jobs: path: | .pytest_cache/ pytest-report-${{ matrix.python }}.xml + htmlcov-${{ matrix.python }}/ retention-days: 7 From 6bfcb65343091e53c2d9cff0b8f41862279d4bbe Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 19:21:53 -0700 Subject: [PATCH 08/38] test: code exec on mac (#1549) Summary: 1. 
adds option to not use bwrap for code execution 2. disable bwrap when running tests on macs Test Plan: ``` LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct ``` Verify code_interpreter result in logs INFO 2025-03-11 08:10:39,858 llama_stack.providers.inline.agents.meta_reference.agent_instance:1032 agents: tool call code_interpreter completed with result: content='completed\n\n541\n' error_message=None error_code=None metadata=None --- .../code_interpreter/code_execution.py | 10 +- .../code_interpreter/code_interpreter.py | 5 +- tests/integration/agents/test_agents.py | 2 +- tests/integration/conftest.py | 9 + .../recorded_responses/chat_completion.json | 6066 ++++++++++++++--- .../recorded_responses/invoke_tool.json | 74 +- 6 files changed, 5260 insertions(+), 906 deletions(-) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index 6f4b25b9d..d7b2dbdef 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -76,6 +76,7 @@ class CodeExecutionRequest: only_last_cell_fail: bool = True seed: int = 0 strip_fpaths_in_stderr: bool = True + use_bwrap: bool = True class CodeExecutor: @@ -103,8 +104,6 @@ _set_seeds()\ script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts) with tempfile.TemporaryDirectory() as dpath: - bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath]) - cmd = [*bwrap_prefix.split(), sys.executable, "-c", script] code_fpath = os.path.join(dpath, "code.py") with open(code_fpath, "w") as f: f.write(script) @@ -118,6 +117,13 @@ _set_seeds()\ MPLBACKEND="module://matplotlib_custom_backend", PYTHONPATH=f"{DIRNAME}:{python_path}", ) + + if req.use_bwrap: + bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath]) + cmd = [*bwrap_prefix.split(), sys.executable, "-c", script] + else: + cmd = [sys.executable, "-c", script] + stdout, stderr, returncode = do_subprocess( cmd=cmd, env=env, diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py index 54f17f9a2..4b97914c5 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py @@ -6,6 +6,7 @@ import logging +import os import tempfile from typing import Any, Dict, List, Optional @@ -61,7 +62,9 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime): async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult: script = kwargs["code"] - req = CodeExecutionRequest(scripts=[script]) + # Use environment variable to control bwrap usage + force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes") + req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap) res = self.code_executor.execute(req) pieces = [res["process_status"]] for out_type in ["stdout", "stderr"]: diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index a542e5403..f6bde8927 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -187,7 +187,7 @@ def 
test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent messages=[ { "role": "user", - "content": "Search the web and tell me who the current CEO of Meta is.", + "content": "Search the web and tell me who the founder of Meta is.", } ], session_id=session_id, diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index f4fe9e8ff..bf1092c4a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,12 +6,17 @@ import inspect import itertools import os +import platform import textwrap from dotenv import load_dotenv +from llama_stack.log import get_logger + from .report import Report +logger = get_logger(__name__, category="tests") + def pytest_configure(config): config.option.tbstyle = "short" @@ -24,6 +29,10 @@ def pytest_configure(config): key, value = env_var.split("=", 1) os.environ[key] = value + if platform.system() == "Darwin": # Darwin is the system name for macOS + os.environ["DISABLE_CODE_SANDBOX"] = "1" + logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS") + if config.getoption("--report"): config.pluginmanager.register(Report(config)) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 7234b6c31..30b7e0b4d 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12535,7 +12535,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] } } ], @@ -12589,7 +12605,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\":", + "text": "type\": \"function\", \"name\": \"", "type": "text" }, "event_type": { @@ -12609,7 +12625,7 @@ "data": { "event": { "delta": { - "text": " \"get_boiling_point\", \"parameters\":", + "text": "get_boiling_point\", \"parameters\": {\"", "type": "text" }, "event_type": { @@ -12629,7 +12645,7 @@ "data": { "event": { "delta": { - "text": " {\"liquid_name\": \"polyjuice", + "text": "liquid_name\": \"polyjuice\", \"celcius\":", "type": "text" }, "event_type": { @@ -12649,7 +12665,7 @@ "data": { "event": { "delta": { - "text": "\", \"celcius\": \"false\"}}", + "text": " \"false\"}}", "type": "text" }, "event_type": { @@ -12679,7 +12695,7 @@ "celcius": "false", "liquid_name": "polyjuice" }, - "call_id": "bffe07d7-343f-49c4-bcff-d83c99fa7d4a", + "call_id": "fc7e2525-3e7b-47ff-8731-12dd7655dfd6", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12720,7 +12736,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] } } ], @@ -12774,7 +12806,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"get", + "text": " \"type\": \"function\",\n \"", "type": "text" }, "event_type": { @@ -12794,7 +12826,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point\",\n \"parameters\": {\n \"liquid_name", + "text": "name\": \"get_boiling_point\",\n \"parameters\":", "type": "text" }, "event_type": { @@ -12814,7 +12846,7 @@ "data": { "event": { "delta": { - "text": "\": \"polyjuice\",\n ", + "text": " {\n 
\"liquid_name\": \"polyjuice\",\n \"celci", "type": "text" }, "event_type": { @@ -12834,7 +12866,7 @@ "data": { "event": { "delta": { - "text": " \"celcius\": \"true\"\n }\n}", + "text": "us\": \"true\"\n }\n}", "type": "text" }, "event_type": { @@ -12864,7 +12896,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "41ce6bfb-81c1-438d-8520-329c4446f1bc", + "call_id": "73212def-09c0-4a29-845e-149afb38fcd1", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12905,7 +12937,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 43 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 55 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 98 + } + ] } } ], @@ -12959,7 +13007,27 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", + "text": " boiling point of polyjuice is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " -100\u00b0C.", "type": "text" }, "event_type": { @@ -12994,7 +13062,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] } } ], @@ -13048,7 +13132,27 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\u00b0C.", "type": "text" }, "event_type": { @@ -13083,13 +13187,230 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 109 + } + ] } } ], "type": "generator" }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid_name\": \"poly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "juice\", \"celcius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "e4b0121a-7b75-4e89-be40-d13021a3bb11", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { 
"__module__": "llama_stack.apis.inference.inference", @@ -13172,7 +13493,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"cel", + "tool_call": "_with_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\",", "type": "tool_call" }, "event_type": { @@ -13197,7 +13518,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "cius\": \"true\"}}", + "tool_call": " \"celcius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13227,192 +13548,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "6161b956-9b68-4e88-87bf-e26a07d4c7ca", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": 
"ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid_name\": \"polyju", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "ice\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" - }, - "call_id": "11da4a37-d7ad-468a-98c8-0f1e295d14a9", + "call_id": "11c04896-2b7b-49bd-b832-47a1c9f3796f", "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" @@ -13453,7 +13589,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": 
"total_tokens", + "unit": null, + "value": 47 + } + ] } } ], @@ -13507,7 +13659,27 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store clerk.", + "text": " customer smiled and said \"hello\" to the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " friendly store clerk.", "type": "text" }, "event_type": { @@ -13542,7 +13714,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 24 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 54 + } + ] } } ], @@ -13957,6 +14145,1165 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file contains data on inflation rates for each month of the year from", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 2014 to 2018. 
The columns are:\n\n- Year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": The year of the inflation rate\n-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Jan to Dec: The inflation rate for each month of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the year\n\nThe inflation rates are all in the range of ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "1.6 to 2.4, indicating", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a relatively stable inflation rate over the years.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 471 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 562 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 
0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that there is an issue with the import statement. 
However", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", the code provided does not contain any import statements that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " would cause this error.\n\nTo provide a more accurate answer, I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " would need to know the contents of the CSV file or more information", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " about the error message.\n\nHowever, based on the code provided, it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " seems like the code is trying to load a CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file and print some basic information about it. 
If the file is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found or there is an issue with the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path, this could cause an error.\n\nHere is a revised version", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of the code that includes some error handling:\n\n``", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "`\nimport pandas as pd\nimport code_interpreter\n\ntry:\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " # Load the CSV file\n df = pd.read_csv(\"/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "var/folders/cz/vyh7y1d11x", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "g881lsxsshnc5c0000gn/T/tmp", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_d_cdeif/6TpkUAo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "0inflation.csv\")\n\n # Print the first few rows of the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " dataframe\n print(df.head())\n\n # Print the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " data types of each column\n print(df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".dtypes)\n\n # Print the summary statistics of the dataframe\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file was", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found.\")\nexcept pd.errors.EmptyDataError:\n print(\"The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file is empty.\")\nexcept pd.errors.ParserError:\n print(\"An", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": 
"progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error occurred while parsing the file.\")\nexcept Exception as e:\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(\"An error occurred: \", str(e))\n```\n\nThis code will", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " catch specific exceptions that could occur when loading the CSV file and print a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " more informative error message.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 391 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 330 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 721 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": 
\"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/cz/vyh7y1d11xg881", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + 
"__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "lsxsshnc5c0000gn/T/tmp_d_cdeif", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/6TpkUAo0inflation.csv\")\n\n# Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " first few rows of the dataframe\nprint(df.head())\n\n# Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data types of each column\nprint(df.dtypes)\n\n# Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " summary statistics of the dataframe\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/6TpkUAo0inflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# 
Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "fa1b393f-3fc7-416f-98ab-05d879def880", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 214 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 224 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -15046,7 +16393,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n", + "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -15071,7 +16418,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import code_interpreter\n\n# Load the CSV file\ndf =", + "tool_call": "z/vyh7y1d11xg881lsxsshnc5", "type": "tool_call" }, "event_type": { @@ -15096,7 +16443,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " pd.read_csv(\"/var/folders", + "tool_call": "c0000gn/T/tmpe8u6r9sz/R", "type": "tool_call" }, "event_type": { @@ -15121,157 +16468,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/cz/vyh7y1d11xg881", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "lsxsshnc5c0000gn/T/tmp4ed7", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "p2bg/Csr659svinflation.csv\")\n\n# Print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the first few rows of the dataframe\nprint(df.head())\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - 
"__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print the data types of each column\nprint(df.dtypes)\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print the summary statistics of the dataframe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint(df.describe())", + "tool_call": "ChoI8s0inflation.csv\")\nprint(df.head())", "type": "tool_call" }, "event_type": { @@ -15298,9 +16495,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpe8u6r9sz/RChoI8s0inflation.csv\")\nprint(df.head())" }, - "call_id": "8aeab20b-341b-4349-84dc-3e3c3299d713", + "call_id": "35e85870-f8f3-44f4-8879-e7b02a2805f6", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -15345,7 +16542,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] } } ], @@ -17611,6 +18824,670 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": 
[{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. 
The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " yearly inflation over time. The x-axis represents", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the year, and the y-axis represents the average", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " yearly inflation. 
The plot will show the trend of average yearly inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " over the years.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 633 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 689 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": 
\"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot", + 
"type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as plt\n\n# Load data\ndf = pd.read_csv(\"/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "var/folders/cz/vyh7y1d11", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "xg881lsxsshnc5c0000gn/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp_d_cdeif/UuctHlJzinflation.csv\")\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Calculate average yearly inflation\ndf['Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": "'] = df[['Jan', 'Feb', 'Mar',", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Apr', 'May', 'Jun',", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Jul', 'Aug', 'Sep', 'Oct', '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Nov', 'Dec']].mean(axis=1)\n\n# Plot time series", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(df['Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Yearly Inflation')\nplt.title('Average Yearly Inflation Over", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/UuctHlJzinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f953fd92-9413-4968-9ffa-f85ddea173dc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 453 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 463 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -18040,6 +19917,291 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", 
\"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file contains 10 rows and 13 columns. The columns", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are named 'Year', 'Jan', 'Feb', 'Mar',", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'Apr', 'May', 'Jun', 'Jul', 'Aug", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "', 'Sep', 'Oct', 'Nov', '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Dec'. 
The data types of these columns are int64 for '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Year' and float64 for the rest.\n\nIt appears that this CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file contains monthly inflation rates for different years. The 'Year' column", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " represents the year, and the rest of the columns represent the inflation rates", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for each month of the year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 326 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 125 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 451 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful 
assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -18098,7 +20260,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var", "type": "tool_call" }, "event_type": { @@ -18123,7 +20285,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7y1d", + "tool_call": "/folders/cz/vyh7y1d11xg881lsx", "type": "tool_call" }, "event_type": { @@ -18148,7 +20310,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "11xg881lsxsshnc", + "tool_call": "sshnc5c0000gn/T/tmp_d_cdeif/Uuct", "type": "tool_call" }, "event_type": { @@ -18173,7 +20335,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "5c0000gn/T/tmp4ed7p2bg/U", + "tool_call": "HlJzinflation.csv\")\n# Rows\nprint(\"Number of rows", "type": "tool_call" }, "event_type": { @@ -18198,7 +20360,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Z0Z335vinflation.csv\")\n# Rows\nprint(\"", + "tool_call": " and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns", "type": "tool_call" }, "event_type": { @@ -18223,7 +20385,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Number of rows and columns in the data:\", df.shape)\n# Columns", + "tool_call": " of the data are:\", len(df.columns))\n# Column names\nprint(\"", "type": "tool_call" }, "event_type": { @@ -18248,7 +20410,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\nprint(\"Columns of the data are:\", len(df.columns))\n# Column", + "tool_call": "Columns of the data are:\", df.columns)\n# Column dtypes\nprint", "type": "tool_call" }, "event_type": { @@ -18273,57 +20435,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " names\nprint(\"Columns of the data are:\", df.columns)\n# Column", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dtypes\nprint(\"Datatype of the columns are:\", df.dtypes", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")", + "tool_call": "(\"Datatype of the columns are:\", df.dtypes)", "type": "tool_call" }, "event_type": { @@ -18350,9 +20462,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/UZ0Z335vinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/UuctHlJzinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "98e27ff4-d4d7-4764-9213-f46bb928ec68", + "call_id": "479e0208-711f-4318-b284-745599a9fb9c", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -18397,8 +20509,1524 @@ "value": "end_of_turn" } }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation 
for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:e40e6\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"query\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "How to use LoRA in Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "6ee142d9-1a65-433e-a681-f20066a2e1f7", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + 
"__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une based on the documentation you provided.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": 
null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:cc255\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"How to use LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "a7b02498-0a50-40c2-abf2-563d4d26d01f", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " documentation you provided. 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } } ], "type": "generator" @@ -23085,7 +26713,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -23110,7 +26738,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Torchtune documentation\"}}", + "tool_call": " \"parameters\": {\"query\": \"Torchtune documentation\"}}", "type": "tool_call" }, "event_type": { @@ -23139,7 +26767,7 @@ "arguments": { "query": "Torchtune documentation" }, - "call_id": "5cfa4683-2147-41ab-9a44-a8b7f23e9f75", + "call_id": "8b279fd4-0479-48d4-8ef7-882004f994b2", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23180,7 +26808,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 39 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 49 + } + ] } } ], @@ -23234,7 +26878,27 @@ "data": { "event": { "delta": { - "text": " attention type used by Llama3-8B is grouped-query attention.", + "text": " attention type used by Llama3-8B is grouped-query attention", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -23269,7 +26933,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } } ], @@ -23323,7 +27003,7 @@ "data": { "event": { "delta": { - "text": " attention type used by Llama3", + "text": " attention type used by Llama3-8B is grouped", "type": "text" }, "event_type": { @@ -23343,7 +27023,7 @@ "data": { "event": { "delta": { - "text": "-8B is grouped-query attention.", + "text": "-query attention.", "type": "text" }, "event_type": { @@ -23378,7 +27058,23 @@ "value": "end_of_turn" } }, - "metrics": null + 
"metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } } ], @@ -23432,7 +27128,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n ", + "text": " \"type\": \"function\",\n \"name\": \"knowledge", "type": "text" }, "event_type": { @@ -23452,27 +27148,7 @@ "data": { "event": { "delta": { - "text": " \"name\": \"knowledge_search\",\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \"parameters\": {\n \"query", + "text": "_search\",\n \"parameters\": {\n \"query", "type": "text" }, "event_type": { @@ -23521,7 +27197,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "b2d62231-df92-43ed-b51f-f7b8a4bc4b15", + "call_id": "34e18a7e-5c52-403b-9dd3-cdb3f7f8cb89", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23562,7 +27238,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 48 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 88 + } + ] } } ], @@ -23626,7 +27318,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "tool_call" }, "event_type": { @@ -23651,32 +27343,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"knowledge_search\", \"parameters\": {\"query\": \"Llama", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3-8B attention type\"}}", + "tool_call": "\": {\"query\": \"Llama3-8B attention type\"}}", "type": "tool_call" }, "event_type": { @@ -23705,7 +27372,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "52c2b1ea-3695-4030-87a1-d0ca6d1056af", + "call_id": "2c25de18-5466-4f97-bef1-b463abbbcd55", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23746,7 +27413,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } } ], @@ -24269,7 +27952,596 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", 
\"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Meta founder\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.70726365, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. 
Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.467308, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.03678684, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": 
{\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1220 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1238 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"Meta founder\")", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Meta founder" + }, + "call_id": "3918bbe3-28b2-411d-be4a-f0eb2aa137e8", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 33 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 43 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not able to find the boiling point of poly", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "juice as it is a fictional liquid from the Harry Potter series", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". The function is only able to find the boiling point of real", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of 
polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "` is not able to find the boiling point of polyjuice", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " as it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -24337,7 +28609,7 @@ "data": { "event": { "delta": { - "text": " find the boiling point of polyjuice as", + "text": " find the boiling point of polyju", "type": "text" }, "event_type": { @@ -24357,27 +28629,7 @@ "data": { "event": { "delta": { - "text": " it is a fictional liquid from the Harry Potter series. The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function is only able to find the boiling point of real liquids.", + "text": "ice as it is not a real liquid.", "type": "text" }, "event_type": { @@ -24412,265 +28664,23 @@ "value": "end_of_turn" } }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": 
null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" + { + "metric": "completion_tokens", + "unit": null, + "value": 38 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function `get_boiling_point` is not able to find the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " boiling point of polyjuice as it is not a real liquid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": 
"ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" + { + "metric": "total_tokens", + "unit": null, + "value": 108 } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function `get_boiling_point` is not able to find the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " boiling point of polyjuice as it is not a real liquid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null + ] } } ], @@ -24734,7 +28744,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -24759,7 +28769,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"poly", + "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"", "type": "tool_call" }, "event_type": { @@ -24784,7 +28794,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "juice\"}}", + "tool_call": "polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -24813,7 +28823,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "83d9f330-4c7a-4dd3-8fcb-ccc5301c1f83", + "call_id": "98ea49c2-7788-4fda-8d84-4a584eb1dd27", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -24854,7 +28864,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } } ], @@ -24918,7 +28944,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"get", "type": "tool_call" }, "event_type": { @@ -24943,7 +28969,7 @@ "__module__": "llama_stack.apis.common.content_types", 
"value": "in_progress" }, - "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", + "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name\": \"polyju", "type": "tool_call" }, "event_type": { @@ -24968,7 +28994,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"polyjuice\"}}", + "tool_call": "ice\"}}", "type": "tool_call" }, "event_type": { @@ -24997,7 +29023,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "98c63572-06c8-4cc0-a14e-3b10fb9ddc19", + "call_id": "a9d6416d-a739-49b0-b7df-63694412a376", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -25038,7 +29064,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } } ], @@ -25092,7 +29134,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of Polyjuice", + "text": " couldn't find any information on the boiling point of", "type": "text" }, "event_type": { @@ -25112,7 +29154,7 @@ "data": { "event": { "delta": { - "text": ". Polyjuice is a magical potion in the Harry Potter series", + "text": " Polyjuice. Polyjuice is a magical potion in the", "type": "text" }, "event_type": { @@ -25132,7 +29174,7 @@ "data": { "event": { "delta": { - "text": " that allows the drinker to transform into someone else. It's", + "text": " Harry Potter series that allows the drinker to transform into someone else", "type": "text" }, "event_type": { @@ -25152,7 +29194,7 @@ "data": { "event": { "delta": { - "text": " not a physical substance with a boiling point. If you have any", + "text": ". It's not a physical substance with a boiling point. 
If", "type": "text" }, "event_type": { @@ -25172,7 +29214,27 @@ "data": { "event": { "delta": { - "text": " other questions, I'd be happy to help.", + "text": " you have any other questions, I'd be happy to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " help.", "type": "text" }, "event_type": { @@ -25207,7 +29269,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 103 + } + ] } } ], @@ -25271,7 +29349,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -25296,7 +29374,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", + "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\":", "type": "tool_call" }, "event_type": { @@ -25350,7 +29428,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "cdccc866-97a0-40fd-b6e2-a0555f0ed921", + "call_id": "722b4201-b9ac-440f-80c4-0e9a7f0b5369", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -25391,7 +29469,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } } ], @@ -25486,6 +29580,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\n541\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 100th prime number is 541.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 217 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 237 + } + ] + } + } + ], + "type": "generator" + }, 
"[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -25544,7 +29743,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n ", + "tool_call": "def is_prime(n):\n if n <= 1:\n return False\n", "type": "tool_call" }, "event_type": { @@ -25569,7 +29768,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return False\n if n <= 3:\n return True", + "tool_call": " if n <= 3:\n return True\n if n % ", "type": "tool_call" }, "event_type": { @@ -25594,7 +29793,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i +", + "tool_call": "2 == 0 or n % 3 == 0:\n return False", "type": "tool_call" }, "event_type": { @@ -25619,7 +29818,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 2) == 0:\n return False\n ", + "tool_call": "\n i = 5\n while i * i <= n:\n ", "type": "tool_call" }, "event_type": { @@ -25644,7 +29843,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " i += 6\n return True\n\ndef get_nth_prime(n):\n count", + "tool_call": " if n % i == 0 or n % (i + 2)", "type": "tool_call" }, "event_type": { @@ -25669,7 +29868,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " = 0\n num = 2\n while True:\n if", + "tool_call": " == 0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -25694,7 +29893,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " is_prime(num):\n count += 1", + "tool_call": " += 6\n return True", "type": "tool_call" }, "event_type": { @@ -25719,7 +29918,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if count == n:\n return num\n num += ", + "tool_call": "\n\n", "type": "tool_call" }, "event_type": { @@ -25744,7 +29943,82 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "1\n\nprint(get_nth_prime(100))", + "tool_call": "def get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " count += 1\n if count == n:\n return", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num\n num += 1\n\nprint(get_nth_prime(100", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "))", "type": "tool_call" }, "event_type": { @@ -25773,7 +30047,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "7fca0515-82f3-46e1-bbec-eceb8fa5162e", + "call_id": "229f89b4-d07d-4748-b8ae-f805cb52209e", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -25818,7 +30092,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } } ], @@ -25872,7 +30162,7 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022", + "text": "plexity the company was founded in 202", "type": "text" }, "event_type": { @@ -25892,7 +30182,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": "2.", "type": "text" }, "event_type": { @@ -25927,7 +30217,23 @@ "value": 
"end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 105 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 127 + } + ] } } ], @@ -25981,7 +30287,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\",", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -26001,27 +30307,7 @@ "data": { "event": { "delta": { - "text": " \"parameters\": {\"query\": \"Perplexity company founding", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " date\"}}", + "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "text" }, "event_type": { @@ -26050,7 +30336,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "ca248109-25af-4737-90cb-6461faaf4e63", + "call_id": "5ea88dde-f090-4157-9219-45a16100ef21", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -26091,7 +30377,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 67 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 104 + } + ] } } ], @@ -26180,32 +30482,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Perplexity", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " company founding date\"}}", + "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", "type": "tool_call" }, "event_type": { @@ -26234,7 +30511,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "94a9fd55-7658-482d-8595-d2c2a23b3a1e", + "call_id": "06c95bef-9b2d-4380-bf16-e1338bb7cf2c", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -26275,7 +30552,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 29 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 39 + } + ] } } ], @@ -26438,7 +30731,7 @@ "data": { "event": { "delta": { - "text": "{\"", + "text": "The", "type": "text" }, "event_type": { @@ -26458,7 +30751,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -26478,7 +30771,7 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": 
\"when was the nba created\"}}", + "text": " the merger of the Basketball Association of America (BAA) and", "type": "text" }, "event_type": { @@ -26498,19 +30791,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "when was the nba created" - }, - "call_id": "7b01a40d-a6a8-4c86-b91d-1790e7480e57", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " the National Basketball League (NBL).", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26518,11 +30800,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -26548,7 +30826,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 65 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } } ], @@ -26612,7 +30906,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "tool_call": "{\"type\": \"function\", \"", "type": "tool_call" }, "event_type": { @@ -26637,7 +30931,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"query\": \"when was the nba created", + "tool_call": "name\": \"knowledge_search\", \"parameters\": {\"query\": \"", "type": "tool_call" }, "event_type": { @@ -26662,7 +30956,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\"}}", + "tool_call": "when was the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -26691,7 +30985,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "bbaf750a-0337-4c83-9bf2-76c2f72d45c3", + "call_id": "b08bb4c0-c0a1-4063-b110-3947559e4061", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -26732,7 +31026,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 27 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 37 + } + ] } } ], diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 76191e992..30a132904 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -5,7 +5,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\n541\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -31,7 +31,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": 
"completed\n[stdout]\nNumber of rows and columns in the data: (10, 13)\nColumns of the data are: 13\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\n 'Oct', 'Nov', 'Dec'],\n dtype='object')\nDatatype of the columns are: Year int64\nJan float64\nFeb float64\nMar float64\nApr float64\nMay float64\nJun float64\nJul float64\nAug float64\nSep float64\nOct float64\nNov float64\nDec float64\ndtype: object\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -70,7 +70,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -83,7 +83,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 142, in \n line 23, in \n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\nImportError: attempted relative import with no known parent package\n[/stderr]", "error_code": null, "error_message": null, "metadata": null @@ -116,6 +116,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -154,23 +167,23 @@ "type": "text" }, { - "text": "Result 
1:\nDocument_id:961ff\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. 
When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:cc255\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"}, \"tool_name\": \"web_search\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "{\"query\": \"Meta founder\", \"top_k\": [{\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.81595254, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.70726365, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.467308, \"raw_content\": null}, {\"title\": \"Meta Platforms - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Meta_Platforms\", \"content\": \"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\", \"score\": 0.14999175, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.03678684, \"raw_content\": null}]}", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"query\": \"NBA creation date\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}]": { "type": "value", "value": { @@ -374,23 +400,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:24443\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. 
code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:7a06a\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:7a06a\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -402,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "24443dfb-a0b3-4ce8-820e-3fb1f12364bb", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6" + "16a6ae01-049e-4a44-b305-8248d20a8f7d", + "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", + "7a06a3a9-7e9d-4693-8c07-15343f0654aa", + "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", + "7a06a3a9-7e9d-4693-8c07-15343f0654aa" ] } } From a505bf45a34891c1ed88633abeb88248db92889f Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 19:41:48 -0700 Subject: [PATCH 09/38] feat(api): remove tool_name from ToolResponseMessage (#1599) Summary: This is not used anywhere. 
closes #1421 Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --record-responses --- docs/_static/llama-stack-spec.html | 39 +- docs/_static/llama-stack-spec.yaml | 28 +- llama_stack/apis/inference/inference.py | 2 - .../agents/meta_reference/agent_instance.py | 8 +- .../recorded_responses/chat_completion.json | 8712 ++++++++++++++++- .../recorded_responses/invoke_tool.json | 40 +- 6 files changed, 8637 insertions(+), 192 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index dbd530aa3..c50554092 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4347,24 +4347,6 @@ "type": "string", "description": "Unique identifier for the tool call this response is for" }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ], - "title": "BuiltinTool" - }, - { - "type": "string" - } - ], - "description": "Name of the tool that was called" - }, "content": { "$ref": "#/components/schemas/InterleavedContent", "description": "The response content from the tool" @@ -4374,7 +4356,6 @@ "required": [ "role", "call_id", - "tool_name", "content" ], "title": "ToolResponseMessage", @@ -4673,12 +4654,22 @@ "CompletionResponse": { "type": "object", "properties": { +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "content": { "type": "string", "description": "The generated completion text" @@ -5046,12 +5037,22 @@ "CompletionResponseStreamChunk": { "type": "object", "properties": { +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "delta": { "type": "string", "description": "New content generated since last chunk. This can be one or more tokens." 
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index cca1872a4..1f9536c2e 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2943,17 +2943,6 @@ components: type: string description: >- Unique identifier for the tool call this response is for - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - title: BuiltinTool - - type: string - description: Name of the tool that was called content: $ref: '#/components/schemas/InterleavedContent' description: The response content from the tool @@ -2961,7 +2950,6 @@ components: required: - role - call_id - - tool_name - content title: ToolResponseMessage description: >- @@ -3188,10 +3176,18 @@ components: CompletionResponse: type: object properties: +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + metrics: + type: array + items: + $ref: '#/components/schemas/MetricEvent' +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... content: type: string description: The generated completion text @@ -3510,10 +3506,18 @@ components: CompletionResponseStreamChunk: type: object properties: +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + metrics: + type: array + items: + $ref: '#/components/schemas/MetricEvent' +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... delta: type: string description: >- diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index fa917ac22..0a4324cdf 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -117,13 +117,11 @@ class ToolResponseMessage(BaseModel): :param role: Must be "tool" to identify this as a tool response :param call_id: Unique identifier for the tool call this response is for - :param tool_name: Name of the tool that was called :param content: The response content from the tool """ role: Literal["tool"] = "tool" call_id: str - tool_name: Union[BuiltinTool, str] content: InterleavedContent diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 3f09cacc0..0ae1996cc 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -153,7 +153,6 @@ class ChatAgent(ShieldRunnerMixin): messages.append( ToolResponseMessage( call_id=response.call_id, - tool_name=response.tool_name, content=response.content, ) ) @@ -221,8 +220,7 @@ class ChatAgent(ShieldRunnerMixin): messages = await self.get_messages_from_turns(turns) if is_resume: tool_response_messages = [ - ToolResponseMessage(call_id=x.call_id, tool_name=x.tool_name, content=x.content) - for x in request.tool_responses + ToolResponseMessage(call_id=x.call_id, content=x.content) for x in request.tool_responses ] messages.extend(tool_response_messages) last_turn = turns[-1] @@ -685,7 +683,6 @@ class ChatAgent(ShieldRunnerMixin): result_messages = [ ToolResponseMessage( call_id=tool_call.call_id, - tool_name=tool_call.tool_name, content=tool_result.content, 
) ] @@ -705,7 +702,7 @@ class ChatAgent(ShieldRunnerMixin): tool_responses=[ ToolResponse( call_id=result_message.call_id, - tool_name=result_message.tool_name, + tool_name=tool_call.tool_name, content=result_message.content, metadata=tool_result.metadata, ) @@ -999,7 +996,6 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa return ToolResponseMessage( call_id="", - tool_name=BuiltinTool.code_interpreter, content=content, ) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 30b7e0b4d..8694cc271 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12758,6 +12758,292 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": 
{\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"get_boiling_point\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + 
"__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "parameters\": {\"liquid_name\": \"polyjuice\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "celcius\": \"false\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "1ef7adda-5ebb-41d5-a2c6-3e6700de5f81", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -12806,7 +13092,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"", + "text": " \"type\": \"function\",\n \"name\": \"get", "type": "text" }, "event_type": { @@ -12826,7 +13112,7 @@ "data": { "event": { "delta": { - "text": "name\": \"get_boiling_point\",\n \"parameters\":", + "text": "_boiling_point\",\n \"parameters\": {\n \"liquid", "type": "text" }, "event_type": { @@ -12846,7 +13132,7 @@ "data": { "event": { "delta": { - "text": " {\n \"liquid_name\": \"polyjuice\",\n \"celci", + "text": "_name\": \"polyjuice\",\n \"celci", "type": "text" }, "event_type": { @@ -12896,7 +13182,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "73212def-09c0-4a29-845e-149afb38fcd1", + "call_id": "40293d5b-8a76-4df5-8325-d6e8755ba513", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13084,6 +13370,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], 
{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful 
assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { @@ -13209,6 +13600,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", 
\"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "100 degrees Celcius.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + 
"unit": null, + "value": 25 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 112 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13267,7 +13783,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name", + "tool_call": "{\"type\": \"function\", \"", "type": "tool_call" }, "event_type": { @@ -13292,7 +13808,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid_name\": \"poly", + "tool_call": "name\": \"get_boiling_point\", \"parameters", "type": "tool_call" }, "event_type": { @@ -13317,7 +13833,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "juice\", \"celcius\": \"true\"}}", + "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celci", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "us\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13347,7 +13888,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "e4b0121a-7b75-4e89-be40-d13021a3bb11", + "call_id": "f146d04b-c400-4193-a6d8-ccfea7f7b529", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13468,7 
+14009,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -13493,7 +14034,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_with_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\",", + "tool_call": "get_boiling_point_with_metadata\", \"parameters\": {\"liquid", "type": "tool_call" }, "event_type": { @@ -13518,7 +14059,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"celcius\": \"true\"}}", + "tool_call": "_name\": \"polyjuice\", \"celci", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "us\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "true\"}}", "type": "tool_call" }, "event_type": { @@ -13548,7 +14139,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "11c04896-2b7b-49bd-b832-47a1c9f3796f", + "call_id": "d6b8a25d-9b4c-4650-bbe6-f94b5fa97e56", "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" @@ -13659,7 +14250,7 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the", + "text": " customer smiled and said \"hello\" to the friendly store", "type": "text" }, "event_type": { @@ -13679,7 +14270,7 @@ "data": { "event": { "delta": { - "text": " friendly store clerk.", + "text": " clerk.", "type": "text" }, "event_type": { @@ -16564,6 +17155,1609 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": 
\"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file contains a table with 12 columns (Jan to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Dec) and 5 rows (2014 to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 2018). 
The values in the table represent the inflation rate", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for each month of the year from 2014", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to 2018.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 469 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 61 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 530 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that there is an issue with", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the import statement. 
However, the code provided does not contain any", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " import statements that would cause this error.\n\nTo provide a more accurate", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " answer, I would need to know the contents of the CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or more information about the error message.\n\nHowever, based on the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code provided, it seems like the code is trying to load a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file and print some basic information about it. 
If the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not found or there is an issue with the file path,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this could cause an error.\n\nHere is a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " revised version of the code that includes some error", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " handling:\n\n```\nimport pandas as pd\nimport code_interpreter", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\n\ntry:\n # Load the CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\n df = pd.read_csv(\"/var/folders/cz", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/vyh7y1d11xg", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", 
+ "data": { + "event": { + "delta": { + "text": "881lsxsshnc5", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "c0000gn/T/tmpflpgiagc/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8S20Zj2Oinflation.csv\")\n\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " # Print the first few rows of the dataframe\n print(df.head", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "())\n\n # Print the data types of each column\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(df.dtypes)\n\n # Print the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " summary statistics of the dataframe\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " was not found.\")\nexcept pd.errors.EmptyDataError", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n print(\"The file is empty.\")\nexcept pd.errors.ParserError", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n print(\"An error occurred while parsing the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file.\")\nexcept Exception as e:\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(\"An error occurred: \", str(e))\n``", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "`\n\nThis code will catch specific exceptions that could occur when loading the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file and print a more", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " informative error message.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", 
+ "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 393 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 331 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 724 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, 
\"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T/tmpfl", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "pgiagc/8S20Zj2Oinflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".csv\")\n\n# Print the first few rows of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.head())\n\n# Print the data types of each", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column\nprint(df.dtypes)\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the summary statistics of the dataframe\nprint(df.describe())", 
+ "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "e999a578-cbd8-4bb8-bc53-deb2fff1ffce", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 215 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 225 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " CSV file\ndf = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": 
"llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "7y1d11xg881lsxsshnc5c", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0000gn/T/tmpflpgiagc/8S", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "20Zj2Oinflation.csv\")\n\n# Print the first", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " few rows of the dataframe\nprint(df.head())\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the data types of each column\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { 
+ "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.dtypes)\n\n# Print the summary statistics of the dataframe", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "ea72d524-2d0f-4220-a898-4c295315235e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", 
\"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, 
{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -19488,6 +21682,1269 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul 
float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over time. The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " x-axis represents the year, and the y-axis represents the average yearly inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
The plot will show the trend of average yearly inflation over the years", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 635 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 691 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": 
\"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", 
+ "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"/var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T/tmpflpgiagc/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf['Average'] = df[['Jan', 'Feb', '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Mar', 'Apr', 'May', 'Jun', 'Jul',", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Aug', 'Sep', 'Oct', 'Nov', 'Dec", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "=(10,6))\nplt.plot(df['Year'], df['Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Inflation')\nplt.title('Average Yearly Inflation Over", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f82fa3fd-e3be-4cb7-9298-8b4625cf709e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 454 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 464 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data 
are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nThe 'Year' column likely contains the year for which the inflation rates are given. The other columns ('Jan' to 'Dec') likely contain the inflation rates for each month of the year.\\n\\nPlease note that the actual data in the CSV file is not provided, so the above description is based on the structure of the file.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " time. The x-axis represents the year and the y-axis", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " represents the average yearly inflation. 
The plot will show the trend", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of average yearly inflation over the years.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 661 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 55 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 716 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": 
\"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nThe 'Year' column likely contains the year for which the inflation rates are given. The other columns ('Jan' to 'Dec') likely contain the inflation rates for each month of the year.\\n\\nPlease note that the actual data in the CSV file is not provided, so the above description is based on the structure of the file.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\n", + "type": 
"tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df = pd.read_csv(\"/var/folders/cz/vyh7", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c0000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmpfsp7c9_g/Aih5TPOuin", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\n\n# Calculate average yearly inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf['Average'] = df[['Jan', 'Feb',", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + 
"tool_call": " 'Mar', 'Apr', 'May', 'Jun', '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Jul', 'Aug', 'Sep', 'Oct', 'Nov", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "', 'Dec']].mean(axis=1)\n\n# Plot time series", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(df['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Average Yearly Inflation')\nplt.title('", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Average Yearly Inflation Over Time')\nplt.grid(True)\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpfsp7c9_g/Aih5TPOuinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "dce1b106-06e1-4163-ae85-f9a2491f4375", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 480 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 490 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": 
{\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -20531,6 +23988,645 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": 
\"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file contains 10 rows and 13 columns. The columns are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " named 'Year', 'Jan', 'Feb', 'Mar', '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Apr', 'May', 'Jun', 'Jul', 'Aug',", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'Sep', 'Oct', 'Nov', 'Dec'. The data", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " types of these columns are int64 for 'Year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "' and float64 for the rest.\n\nIt appears that this CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " contains monthly inflation rates for different years. 
The 'Year' column represents", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the year, and the rest of the columns represent the inflation rates", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for each month of the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " year.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 327 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 125 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 452 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh7", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": 
null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0gn/T/tmpflpgiagc/2VkeqrPlinflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".csv\")\n# Rows\nprint(\"Number of rows and columns in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of the data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the columns are:\", df.dtypes)", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + }, + "call_id": "b8aab119-7997-428e-81ab-e6aa163f7acc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:e40e6\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b299f\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"How to use Lo", + "type": "text" + }, + "event_type": { + 
"__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "RA in Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "3d9a3bd1-4a05-4feb-b5a2-eed7a7a24f1b", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:13786\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:15b86\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"parameters\": {\"query\": \"How to use LoRA", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "548b1430-be4a-4c22-9430-62bda6dd150c", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the documentation you provided. 
What's your first question", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:65275\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "How to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "b1a5c1c5-905e-4206-95f6-e30f9b07376d", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une based on the documentation you provided. 
What's your first", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:5c435\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:6dc04\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "parameters\": {\"query\": \"How to use LoRA in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "d4e8b8eb-a0be-4434-b270-48315bf20723", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { @@ -26713,7 +33769,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -26738,7 +33794,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"query\": \"Torchtune documentation\"}}", + "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"Tor", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "chtune documentation\"}}", "type": "tool_call" }, "event_type": { @@ -26767,7 +33848,7 @@ "arguments": { "query": "Torchtune documentation" }, - "call_id": "8b279fd4-0479-48d4-8ef7-882004f994b2", + "call_id": "cf722fb9-6067-46ea-8534-852b7d364278", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -27080,6 +34161,256 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8B", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. 
Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8B", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": 
{\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { @@ -27128,7 +34459,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"knowledge", + "text": " \"type\": \"function\",\n \"name\": \"knowledge_search\",\n", "type": "text" }, "event_type": { @@ -27148,7 +34479,7 @@ "data": { "event": { "delta": { - "text": "_search\",\n \"parameters\": {\n \"query", + "text": " \"parameters\": {\n \"query\": \"L", "type": "text" }, "event_type": { @@ -27168,7 +34499,7 @@ "data": { "event": { "delta": { - "text": "\": \"Llama3-8B attention type\"\n }\n}", + "text": "lama3-8B attention type\"\n }\n}", "type": "text" }, "event_type": { @@ -27197,7 +34528,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "34e18a7e-5c52-403b-9dd3-cdb3f7f8cb89", + "call_id": "9106bccf-d0c5-4b0a-9398-0b5972ada295", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -27318,7 +34649,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -27343,7 +34674,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Llama3-8B attention type\"}}", + "tool_call": " \"parameters\": {\"query\": \"Llama3-8", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "B attention type\"}}", "type": "tool_call" }, "event_type": { @@ -27372,7 +34728,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "2c25de18-5466-4f97-bef1-b463abbbcd55", + "call_id": "768fe977-8297-42bd-90c3-b1dc07882ce0", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -28057,6 +35413,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", 
\"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Meta founder\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.70726365, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. 
Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.467308, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.03678684, \\\"raw_content\\\": null}]}\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, 
\"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1220 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1238 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -28115,7 +35576,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave_search.call(query=\"Meta founder\")", + "tool_call": "brave_search.call(query", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "=\"Meta founder\")", "type": "tool_call" }, "event_type": { @@ -28144,7 +35630,7 @@ "arguments": { "query": "Meta founder" }, - "call_id": "3918bbe3-28b2-411d-be4a-f0eb2aa137e8", + "call_id": "b81c41ae-5eb7-41b7-b466-78eb25a91bb7", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -28686,6 +36172,461 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, 
\"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " find the boiling point of polyjuice as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it is a fictional liquid from the Harry Potter series. 
The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function is only able to find the boiling point of real liquids", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", 
\"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { 
+ "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the", + "type": "text" + }, + "event_type": { + 
"__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -28744,7 +36685,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", "type": "tool_call" }, "event_type": { @@ -28769,32 +36710,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "polyjuice\"}}", + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -28823,7 +36739,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "98ea49c2-7788-4fda-8d84-4a584eb1dd27", + "call_id": "b63f9b8c-c514-48bb-8e0f-788b29c1c106", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -28944,7 +36860,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -28969,7 +36885,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name\": \"polyju", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", "type": "tool_call" }, "event_type": { @@ -28994,7 +36910,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "ice\"}}", + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -29023,7 +36939,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "a9d6416d-a739-49b0-b7df-63694412a376", + "call_id": "ec121f44-66e0-47e8-971a-211142998c65", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -29134,7 +37050,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of", + "text": " couldn't find any information on the boiling point of Polyjuice", "type": "text" }, "event_type": { @@ -29154,7 +37070,7 @@ "data": { "event": { "delta": { - "text": " Polyjuice. Polyjuice is a magical potion in the", + "text": ". Polyjuice is a magical potion in the Harry Potter series", "type": "text" }, "event_type": { @@ -29174,7 +37090,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series that allows the drinker to transform into someone else", + "text": " that allows the drinker to transform into someone else.", "type": "text" }, "event_type": { @@ -29194,7 +37110,7 @@ "data": { "event": { "delta": { - "text": ". It's not a physical substance with a boiling point. If", + "text": " It's not a physical substance with a boiling point. 
If", "type": "text" }, "event_type": { @@ -29214,7 +37130,7 @@ "data": { "event": { "delta": { - "text": " you have any other questions, I'd be happy to", + "text": " you have any other questions, I'd", "type": "text" }, "event_type": { @@ -29234,7 +37150,7 @@ "data": { "event": { "delta": { - "text": " help.", + "text": " be happy to help.", "type": "text" }, "event_type": { @@ -29349,7 +37265,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -29374,7 +37290,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\":", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", "type": "tool_call" }, "event_type": { @@ -29399,7 +37315,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"polyjuice\"}}", + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -29428,7 +37344,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "722b4201-b9ac-440f-80c4-0e9a7f0b5369", + "call_id": "1ca40c99-853b-44e3-ab2c-f194e3ed1b45", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -29685,6 +37601,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\n541\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 100th prime number is 541.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 217 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 237 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -29743,7 +37764,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False\n", + "tool_call": "def is_prime(n):\n if n <= 1:\n return False", "type": "tool_call" }, "event_type": { @@ -29768,7 +37789,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n <= 3:\n return True\n if n % ", + "tool_call": "\n if n <= 3:\n ", "type": "tool_call" }, "event_type": { @@ -29793,7 +37814,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2 == 0 or n % 3 == 0:\n return False", + "tool_call": " return True\n if n % 2 ==", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0 or n % 3 == 0:\n return False", "type": "tool_call" }, "event_type": { @@ -29843,7 +37889,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n % i == 0 or n % (i + 2)", + "tool_call": " if n % i == 0 or n % (i + 2", "type": "tool_call" }, "event_type": { @@ -29868,7 +37914,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " == 0:\n return False\n i", + "tool_call": ") == 0:\n return False\n i += 6\n ", "type": "tool_call" }, "event_type": { @@ -29893,7 +37939,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " += 6\n return True", + "tool_call": " return True\n\ndef get_nth_prime(n):\n count = 0\n ", "type": "tool_call" }, "event_type": { @@ -29918,7 +37964,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n\n", + "tool_call": " num = 2\n while True:\n if is_prime(num):\n ", "type": "tool_call" }, "event_type": { @@ -29943,7 +37989,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n ", + "tool_call": " count += 1\n if count == n:\n return num\n", "type": "tool_call" }, "event_type": { @@ -29968,57 +38014,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n if count == n:\n return", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " num\n num += 1\n\nprint(get_nth_prime(100", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "))", + "tool_call": " num += 1\n\nprint(get_nth_prime(100))", "type": "tool_call" }, "event_type": { @@ -30047,7 +38043,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "229f89b4-d07d-4748-b8ae-f805cb52209e", + "call_id": "d8ece88b-7b3e-4f72-9555-5a928c27012c", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -30399,6 +38395,291 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was 
among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Per", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "plexity the company was founded in 2022.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 105 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 127 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"Perplexity company founding", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " date\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "7f40db23-2182-4006-9234-4c5b7dac978f", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": 
"end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 67 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 104 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -30511,7 +38792,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "06c95bef-9b2d-4380-bf16-e1338bb7cf2c", + "call_id": "7f65affe-6ecb-4db5-b70f-71e05e28c310", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -30848,6 +39129,171 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " NBA was created on August 3,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 1949, with the merger of the Basketball Association of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " America (BAA) and the National Basketball League (NBL", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ").", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 65 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -30906,7 +39352,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -30931,7 +39377,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "name\": \"knowledge_search\", \"parameters\": {\"query\": \"", + "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"when", "type": "tool_call" }, "event_type": { @@ -30956,7 +39402,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "when was the nba created\"}}", + "tool_call": " was the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -30985,7 +39431,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "b08bb4c0-c0a1-4063-b110-3947559e4061", + "call_id": "0f4d0151-e44c-443a-8101-e0ac92c9d45f", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 30a132904..8db8ad966 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -167,23 +167,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:cc255\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:15b86\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. 
code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:cc255\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:83901\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:7a06a\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. 
_glossary_fsdp2:\n\n", "type": "text" }, { @@ -428,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "16a6ae01-049e-4a44-b305-8248d20a8f7d", - "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", - "7a06a3a9-7e9d-4693-8c07-15343f0654aa", - "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", - "7a06a3a9-7e9d-4693-8c07-15343f0654aa" + "bbddbe62-508d-4c8d-9455-3b60bc2825a5", + "15b8638f-b1b6-4f58-adfa-eb6644c47de3", + "83901b53-33d4-4f5e-8145-b94c783e9f61", + "15b8638f-b1b6-4f58-adfa-eb6644c47de3", + "83901b53-33d4-4f5e-8145-b94c783e9f61" ] } } From d263edbf90f958349c7b9adea8fd4e5181932a69 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 23:08:24 -0400 Subject: [PATCH 10/38] build: remove .python-version (#1513) # What does this PR do? the current `.python-version` file forces `uv` to setup the development environment with Python 3.10 this causes an error if a dev system does not have Python 3.10, even though the project officially supports newer versions of Python as well since `uv` can use the `pyproject.toml` to determine python versions, we can safely remove this file from the repo and subsequent git tracking follows up on https://github.com/meta-llama/llama-stack/pull/1172 ## Test Plan N/A --------- Signed-off-by: Nathan Weinberg --- .gitignore | 1 + .python-version | 1 - CONTRIBUTING.md | 6 ++++++ 3 files changed, 7 insertions(+), 1 deletion(-) delete mode 100644 .python-version diff --git a/.gitignore b/.gitignore index 1b15107f3..0ef25cdf1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ pyrightconfig.json venv/ pytest-report.xml .coverage +.python-version diff --git a/.python-version b/.python-version deleted file mode 100644 index c8cfe3959..000000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.10 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7c0b5d94e..71e610064 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -61,6 +61,7 @@ outlined on that page and do not file a public issue. We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments. You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/). + You can install the dependencies by running: ```bash @@ -70,6 +71,11 @@ uv pip install -e . source .venv/bin/activate ``` +> [!NOTE] +> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory. +> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. +> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). 
+ Note that you can create a dotenv file `.env` that includes necessary environment variables: ``` LLAMA_STACK_BASE_URL=http://localhost:8321 From 0a0d6cb96e9653aa24998fd4a6de01063c14317e Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 21:55:05 -0700 Subject: [PATCH 11/38] fix: openapi spec gen (#1602) Summary: Test Plan: sh docs/openapi_generator/run_openapi_generator.sh --- docs/_static/llama-stack-spec.html | 20 -------------------- docs/_static/llama-stack-spec.yaml | 16 ---------------- 2 files changed, 36 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index c50554092..22fa781ac 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4654,22 +4654,12 @@ "CompletionResponse": { "type": "object", "properties": { -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" - } - }, ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "content": { "type": "string", "description": "The generated completion text" @@ -5037,22 +5027,12 @@ "CompletionResponseStreamChunk": { "type": "object", "properties": { -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" - } - }, ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "delta": { "type": "string", "description": "New content generated since last chunk. This can be one or more tokens." diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1f9536c2e..1f01351e9 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3176,18 +3176,10 @@ components: CompletionResponse: type: object properties: -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - metrics: - type: array - items: - $ref: '#/components/schemas/MetricEvent' ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... content: type: string description: The generated completion text @@ -3506,18 +3498,10 @@ components: CompletionResponseStreamChunk: type: object properties: -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - metrics: - type: array - items: - $ref: '#/components/schemas/MetricEvent' ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... 
delta: type: string description: >- From d072b5fa0c2b58dcf27e936bb4fa3a95bc6f41d7 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 12 Mar 2025 22:29:58 -0700 Subject: [PATCH 12/38] test: add unit test to ensure all config types are instantiable (#1601) --- .../inline/datasetio/localfs/config.py | 16 ++++-- .../inline/eval/meta_reference/config.py | 16 ++++-- .../providers/inline/inference/vllm/config.py | 4 +- .../inline/post_training/torchtune/config.py | 8 ++- .../inline/safety/code_scanner/config.py | 6 ++- .../inline/safety/llama_guard/config.py | 8 ++- .../inline/safety/prompt_guard/config.py | 7 +++ .../providers/inline/scoring/basic/config.py | 7 ++- .../inline/scoring/llm_as_judge/config.py | 7 ++- .../inline/telemetry/sample/__init__.py | 17 ------- .../inline/telemetry/sample/config.py | 12 ----- .../inline/telemetry/sample/sample.py | 17 ------- .../tool_runtime/code_interpreter/config.py | 6 ++- .../inline/tool_runtime/rag/config.py | 6 ++- .../inline/vector_io/chroma/config.py | 4 +- llama_stack/providers/registry/agents.py | 11 ---- llama_stack/providers/registry/inference.py | 9 ---- llama_stack/providers/registry/safety.py | 30 ----------- llama_stack/providers/registry/telemetry.py | 11 ---- llama_stack/providers/registry/vector_io.py | 10 ---- .../remote/agents/sample/__init__.py | 17 ------- .../providers/remote/agents/sample/config.py | 12 ----- .../providers/remote/agents/sample/sample.py | 17 ------- .../remote/datasetio/huggingface/config.py | 16 ++++-- .../remote/inference/databricks/config.py | 13 +++++ .../remote/inference/runpod/__init__.py | 3 +- .../remote/inference/runpod/config.py | 9 +++- .../remote/inference/runpod/runpod.py | 1 - .../remote/inference/sample/__init__.py | 17 ------- .../remote/inference/sample/config.py | 12 ----- .../remote/inference/sample/sample.py | 23 --------- .../remote/safety/sample/__init__.py | 17 ------- .../providers/remote/safety/sample/config.py | 12 ----- .../providers/remote/safety/sample/sample.py | 23 --------- .../remote/tool_runtime/bing_search/config.py | 8 ++- .../model_context_protocol/config.py | 6 ++- .../tool_runtime/wolfram_alpha/config.py | 8 ++- .../remote/vector_io/qdrant/config.py | 8 ++- .../remote/vector_io/sample/__init__.py | 17 ------- .../remote/vector_io/sample/config.py | 12 ----- .../remote/vector_io/sample/sample.py | 26 ---------- .../remote/vector_io/weaviate/config.py | 6 ++- llama_stack/templates/bedrock/run.yaml | 18 +++++-- llama_stack/templates/cerebras/run.yaml | 21 ++++++-- llama_stack/templates/ci-tests/run.yaml | 21 ++++++-- .../templates/dell/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/dell/run.yaml | 21 ++++++-- llama_stack/templates/dev/run.yaml | 21 ++++++-- .../templates/fireworks/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/fireworks/run.yaml | 24 +++++++-- llama_stack/templates/groq/run.yaml | 21 ++++++-- .../hf-endpoint/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/hf-endpoint/run.yaml | 21 ++++++-- .../hf-serverless/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/hf-serverless/run.yaml | 21 ++++++-- .../meta-reference-gpu/run-with-safety.yaml | 21 ++++++-- .../templates/meta-reference-gpu/run.yaml | 21 ++++++-- .../meta-reference-quantized-gpu/run.yaml | 21 ++++++-- llama_stack/templates/nvidia/run.yaml | 21 ++++++-- .../templates/ollama/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/ollama/run.yaml | 24 +++++++-- llama_stack/templates/open-benchmark/run.yaml | 21 ++++++-- .../remote-vllm/run-with-safety.yaml | 24 
+++++++-- llama_stack/templates/remote-vllm/run.yaml | 24 +++++++-- llama_stack/templates/sambanova/run.yaml | 3 +- .../templates/tgi/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/tgi/run.yaml | 21 ++++++-- .../templates/together/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/together/run.yaml | 24 +++++++-- llama_stack/templates/vllm-gpu/run.yaml | 21 ++++++-- tests/unit/providers/test_configs.py | 50 +++++++++++++++++++ 71 files changed, 662 insertions(+), 465 deletions(-) delete mode 100644 llama_stack/providers/inline/telemetry/sample/__init__.py delete mode 100644 llama_stack/providers/inline/telemetry/sample/config.py delete mode 100644 llama_stack/providers/inline/telemetry/sample/sample.py delete mode 100644 llama_stack/providers/remote/agents/sample/__init__.py delete mode 100644 llama_stack/providers/remote/agents/sample/config.py delete mode 100644 llama_stack/providers/remote/agents/sample/sample.py delete mode 100644 llama_stack/providers/remote/inference/sample/__init__.py delete mode 100644 llama_stack/providers/remote/inference/sample/config.py delete mode 100644 llama_stack/providers/remote/inference/sample/sample.py delete mode 100644 llama_stack/providers/remote/safety/sample/__init__.py delete mode 100644 llama_stack/providers/remote/safety/sample/config.py delete mode 100644 llama_stack/providers/remote/safety/sample/sample.py delete mode 100644 llama_stack/providers/remote/vector_io/sample/__init__.py delete mode 100644 llama_stack/providers/remote/vector_io/sample/config.py delete mode 100644 llama_stack/providers/remote/vector_io/sample/sample.py create mode 100644 tests/unit/providers/test_configs.py diff --git a/llama_stack/providers/inline/datasetio/localfs/config.py b/llama_stack/providers/inline/datasetio/localfs/config.py index f4f495b95..d74521f1f 100644 --- a/llama_stack/providers/inline/datasetio/localfs/config.py +++ b/llama_stack/providers/inline/datasetio/localfs/config.py @@ -3,9 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, @@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import ( class LocalFSDatasetIOConfig(BaseModel): - kvstore: KVStoreConfig = SqliteKVStoreConfig( - db_path=(RUNTIME_BASE_DIR / "localfs_datasetio.db").as_posix() - ) # Uses SQLite config specific to localfs storage + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="localfs_datasetio.db", + ) + } diff --git a/llama_stack/providers/inline/eval/meta_reference/config.py b/llama_stack/providers/inline/eval/meta_reference/config.py index 95b780cca..5b2bec259 100644 --- a/llama_stack/providers/inline/eval/meta_reference/config.py +++ b/llama_stack/providers/inline/eval/meta_reference/config.py @@ -3,9 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel -from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, @@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import ( class MetaReferenceEvalConfig(BaseModel): - kvstore: KVStoreConfig = SqliteKVStoreConfig( - db_path=(RUNTIME_BASE_DIR / "meta_reference_eval.db").as_posix() - ) # Uses SQLite config specific to Meta Reference Eval storage + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="meta_reference_eval.db", + ) + } diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py index 0e85c9a48..51d48e6d5 100644 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ b/llama_stack/providers/inline/inference/vllm/config.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel, Field from llama_stack.schema_utils import json_schema_type @@ -40,7 +42,7 @@ class VLLMConfig(BaseModel): ) @classmethod - def sample_run_config(cls): + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: return { "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}", "max_tokens": "${env.MAX_TOKENS:4096}", diff --git a/llama_stack/providers/inline/post_training/torchtune/config.py b/llama_stack/providers/inline/post_training/torchtune/config.py index 2f48ddfad..ee3504f9e 100644 --- a/llama_stack/providers/inline/post_training/torchtune/config.py +++ b/llama_stack/providers/inline/post_training/torchtune/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Literal, Optional +from typing import Any, Dict, Literal, Optional from pydantic import BaseModel @@ -12,3 +12,9 @@ from pydantic import BaseModel class TorchtunePostTrainingConfig(BaseModel): torch_seed: Optional[int] = None checkpoint_format: Optional[Literal["meta", "huggingface"]] = "meta" + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "checkpoint_format": "meta", + } diff --git a/llama_stack/providers/inline/safety/code_scanner/config.py b/llama_stack/providers/inline/safety/code_scanner/config.py index 75c90d69a..1d880ee9c 100644 --- a/llama_stack/providers/inline/safety/code_scanner/config.py +++ b/llama_stack/providers/inline/safety/code_scanner/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel class CodeScannerConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/safety/llama_guard/config.py b/llama_stack/providers/inline/safety/llama_guard/config.py index 72036fd1c..53849ab33 100644 --- a/llama_stack/providers/inline/safety/llama_guard/config.py +++ b/llama_stack/providers/inline/safety/llama_guard/config.py @@ -4,10 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import List +from typing import Any, Dict, List from pydantic import BaseModel class LlamaGuardConfig(BaseModel): excluded_categories: List[str] = [] + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "excluded_categories": [], + } diff --git a/llama_stack/providers/inline/safety/prompt_guard/config.py b/llama_stack/providers/inline/safety/prompt_guard/config.py index bddd28452..76bd5978d 100644 --- a/llama_stack/providers/inline/safety/prompt_guard/config.py +++ b/llama_stack/providers/inline/safety/prompt_guard/config.py @@ -5,6 +5,7 @@ # the root directory of this source tree. from enum import Enum +from typing import Any, Dict from pydantic import BaseModel, field_validator @@ -23,3 +24,9 @@ class PromptGuardConfig(BaseModel): if v not in [t.value for t in PromptGuardType]: raise ValueError(f"Unknown prompt guard type: {v}") return v + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "guard_type": "injection", + } diff --git a/llama_stack/providers/inline/scoring/basic/config.py b/llama_stack/providers/inline/scoring/basic/config.py index d9dbe71bc..5866be359 100644 --- a/llama_stack/providers/inline/scoring/basic/config.py +++ b/llama_stack/providers/inline/scoring/basic/config.py @@ -3,7 +3,12 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -class BasicScoringConfig(BaseModel): ... +class BasicScoringConfig(BaseModel): + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/config.py b/llama_stack/providers/inline/scoring/llm_as_judge/config.py index 1b538420c..ff63fc5e7 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/config.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/config.py @@ -3,7 +3,12 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -class LlmAsJudgeScoringConfig(BaseModel): ... +class LlmAsJudgeScoringConfig(BaseModel): + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/telemetry/sample/__init__.py b/llama_stack/providers/inline/telemetry/sample/__init__.py deleted file mode 100644 index 4fb27ac27..000000000 --- a/llama_stack/providers/inline/telemetry/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleTelemetryImpl - - impl = SampleTelemetryImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/inline/telemetry/sample/config.py b/llama_stack/providers/inline/telemetry/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/inline/telemetry/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/inline/telemetry/sample/sample.py b/llama_stack/providers/inline/telemetry/sample/sample.py deleted file mode 100644 index a4147a1b2..000000000 --- a/llama_stack/providers/inline/telemetry/sample/sample.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.telemetry import Telemetry - -from .config import SampleConfig - - -class SampleTelemetryImpl(Telemetry): - def __init__(self, config: SampleConfig): - self.config = config - - async def initialize(self): - pass diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py index 167a2c318..7de1ec453 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel class CodeInterpreterToolConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/tool_runtime/rag/config.py b/llama_stack/providers/inline/tool_runtime/rag/config.py index 2d0d2f595..c75c3fc51 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/config.py +++ b/llama_stack/providers/inline/tool_runtime/rag/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel class RagToolRuntimeConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/vector_io/chroma/config.py b/llama_stack/providers/inline/vector_io/chroma/config.py index a1fb60fa6..1e333fe92 100644 --- a/llama_stack/providers/inline/vector_io/chroma/config.py +++ b/llama_stack/providers/inline/vector_io/chroma/config.py @@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel): db_path: str @classmethod - def sample_config(cls) -> Dict[str, Any]: - return {"db_path": "{env.CHROMADB_PATH}"} + def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> Dict[str, Any]: + return {"db_path": db_path} diff --git a/llama_stack/providers/registry/agents.py b/llama_stack/providers/registry/agents.py index 655303f98..3ed59304d 100644 --- a/llama_stack/providers/registry/agents.py +++ b/llama_stack/providers/registry/agents.py @@ -7,11 +7,9 @@ from typing import List from llama_stack.providers.datatypes import ( - AdapterSpec, Api, InlineProviderSpec, ProviderSpec, - remote_provider_spec, ) from llama_stack.providers.utils.kvstore import kvstore_dependencies @@ -39,13 +37,4 @@ def available_providers() -> List[ProviderSpec]: Api.tool_groups, ], ), - remote_provider_spec( - api=Api.agents, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.agents.sample", - config_class="llama_stack.providers.remote.agents.sample.SampleConfig", - ), - ), ] diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index d5f095740..ca4dc59f7 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -68,15 +68,6 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.inference.sentence_transformers", config_class="llama_stack.providers.inline.inference.sentence_transformers.config.SentenceTransformersInferenceConfig", ), - remote_provider_spec( - api=Api.inference, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.inference.sample", - config_class="llama_stack.providers.remote.inference.sample.SampleConfig", - ), - ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/registry/safety.py b/llama_stack/providers/registry/safety.py index b9f7b6d78..1364352e6 100644 --- a/llama_stack/providers/registry/safety.py +++ b/llama_stack/providers/registry/safety.py @@ -27,27 +27,6 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.safety.prompt_guard", config_class="llama_stack.providers.inline.safety.prompt_guard.PromptGuardConfig", ), - InlineProviderSpec( - api=Api.safety, - provider_type="inline::meta-reference", - pip_packages=[ - "transformers", - "torch --index-url https://download.pytorch.org/whl/cpu", - ], - module="llama_stack.providers.inline.safety.meta_reference", - config_class="llama_stack.providers.inline.safety.meta_reference.SafetyConfig", - api_dependencies=[ - Api.inference, - ], - deprecation_error=""" -Provider `inline::meta-reference` for API `safety` does not work with the latest Llama Stack. - -- if you are using Llama Guard v3, please use the `inline::llama-guard` provider instead. -- if you are using Prompt Guard, please use the `inline::prompt-guard` provider instead. 
-- if you are using Code Scanner, please use the `inline::code-scanner` provider instead. - - """, - ), InlineProviderSpec( api=Api.safety, provider_type="inline::llama-guard", @@ -67,15 +46,6 @@ Provider `inline::meta-reference` for API `safety` does not work with the latest module="llama_stack.providers.inline.safety.code_scanner", config_class="llama_stack.providers.inline.safety.code_scanner.CodeScannerConfig", ), - remote_provider_spec( - api=Api.safety, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.safety.sample", - config_class="llama_stack.providers.remote.safety.sample.SampleConfig", - ), - ), remote_provider_spec( api=Api.safety, adapter=AdapterSpec( diff --git a/llama_stack/providers/registry/telemetry.py b/llama_stack/providers/registry/telemetry.py index f3b41374c..fc249f3e2 100644 --- a/llama_stack/providers/registry/telemetry.py +++ b/llama_stack/providers/registry/telemetry.py @@ -7,11 +7,9 @@ from typing import List from llama_stack.providers.datatypes import ( - AdapterSpec, Api, InlineProviderSpec, ProviderSpec, - remote_provider_spec, ) @@ -28,13 +26,4 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.telemetry.meta_reference", config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig", ), - remote_provider_spec( - api=Api.telemetry, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.telemetry.sample", - config_class="llama_stack.providers.remote.telemetry.sample.SampleConfig", - ), - ), ] diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index 8471748d8..fbc495d83 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -92,16 +92,6 @@ def available_providers() -> List[ProviderSpec]: ), api_dependencies=[Api.inference], ), - remote_provider_spec( - api=Api.vector_io, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.vector_io.sample", - config_class="llama_stack.providers.remote.vector_io.sample.SampleVectorIOConfig", - ), - api_dependencies=[], - ), remote_provider_spec( Api.vector_io, AdapterSpec( diff --git a/llama_stack/providers/remote/agents/sample/__init__.py b/llama_stack/providers/remote/agents/sample/__init__.py deleted file mode 100644 index 94456d98b..000000000 --- a/llama_stack/providers/remote/agents/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleAgentsImpl - - impl = SampleAgentsImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/agents/sample/config.py b/llama_stack/providers/remote/agents/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/remote/agents/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/agents/sample/sample.py b/llama_stack/providers/remote/agents/sample/sample.py deleted file mode 100644 index 02e889496..000000000 --- a/llama_stack/providers/remote/agents/sample/sample.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.agents import Agents - -from .config import SampleConfig - - -class SampleAgentsImpl(Agents): - def __init__(self, config: SampleConfig): - self.config = config - - async def initialize(self): - pass diff --git a/llama_stack/providers/remote/datasetio/huggingface/config.py b/llama_stack/providers/remote/datasetio/huggingface/config.py index 1cdae0625..c06996b6f 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/config.py +++ b/llama_stack/providers/remote/datasetio/huggingface/config.py @@ -3,9 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, @@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import ( class HuggingfaceDatasetIOConfig(BaseModel): - kvstore: KVStoreConfig = SqliteKVStoreConfig( - db_path=(RUNTIME_BASE_DIR / "huggingface_datasetio.db").as_posix() - ) # Uses SQLite config specific to HF storage + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="huggingface_datasetio.db", + ) + } diff --git a/llama_stack/providers/remote/inference/databricks/config.py b/llama_stack/providers/remote/inference/databricks/config.py index 6aaf7e594..1d51125cb 100644 --- a/llama_stack/providers/remote/inference/databricks/config.py +++ b/llama_stack/providers/remote/inference/databricks/config.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict from pydantic import BaseModel, Field @@ -20,3 +21,15 @@ class DatabricksImplConfig(BaseModel): default=None, description="The Databricks API token", ) + + @classmethod + def sample_run_config( + cls, + url: str = "${env.DATABRICKS_URL}", + api_token: str = "${env.DATABRICKS_API_TOKEN}", + **kwargs: Any, + ) -> Dict[str, Any]: + return { + "url": url, + "api_token": api_token, + } diff --git a/llama_stack/providers/remote/inference/runpod/__init__.py b/llama_stack/providers/remote/inference/runpod/__init__.py index dcdfa9a84..69bf95046 100644 --- a/llama_stack/providers/remote/inference/runpod/__init__.py +++ b/llama_stack/providers/remote/inference/runpod/__init__.py @@ -5,10 +5,11 @@ # the root directory of this source tree. 
from .config import RunpodImplConfig -from .runpod import RunpodInferenceAdapter async def get_adapter_impl(config: RunpodImplConfig, _deps): + from .runpod import RunpodInferenceAdapter + assert isinstance(config, RunpodImplConfig), f"Unexpected config type: {type(config)}" impl = RunpodInferenceAdapter(config) await impl.initialize() diff --git a/llama_stack/providers/remote/inference/runpod/config.py b/llama_stack/providers/remote/inference/runpod/config.py index e59cfe59b..377a7fe6a 100644 --- a/llama_stack/providers/remote/inference/runpod/config.py +++ b/llama_stack/providers/remote/inference/runpod/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel, Field @@ -21,3 +21,10 @@ class RunpodImplConfig(BaseModel): default=None, description="The API token", ) + + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return { + "url": "${env.RUNPOD_URL:}", + "api_token": "${env.RUNPOD_API_TOKEN:}", + } diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index 783842f71..72f858cd8 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -8,7 +8,6 @@ from typing import AsyncGenerator from openai import OpenAI from llama_stack.apis.inference import * # noqa: F403 -from llama_stack.models.llama.datatypes import Message # from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper diff --git a/llama_stack/providers/remote/inference/sample/__init__.py b/llama_stack/providers/remote/inference/sample/__init__.py deleted file mode 100644 index 13263744e..000000000 --- a/llama_stack/providers/remote/inference/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleInferenceImpl - - impl = SampleInferenceImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/inference/sample/config.py b/llama_stack/providers/remote/inference/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/remote/inference/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/inference/sample/sample.py b/llama_stack/providers/remote/inference/sample/sample.py deleted file mode 100644 index 106381618..000000000 --- a/llama_stack/providers/remote/inference/sample/sample.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.inference import Inference -from llama_stack.apis.models import Model - -from .config import SampleConfig - - -class SampleInferenceImpl(Inference): - def __init__(self, config: SampleConfig): - self.config = config - - async def register_model(self, model: Model) -> None: - # these are the model names the Llama Stack will use to route requests to this provider - # perform validation here if necessary - pass - - async def initialize(self): - pass diff --git a/llama_stack/providers/remote/safety/sample/__init__.py b/llama_stack/providers/remote/safety/sample/__init__.py deleted file mode 100644 index 83a8d0890..000000000 --- a/llama_stack/providers/remote/safety/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleSafetyImpl - - impl = SampleSafetyImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/safety/sample/config.py b/llama_stack/providers/remote/safety/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/remote/safety/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/safety/sample/sample.py b/llama_stack/providers/remote/safety/sample/sample.py deleted file mode 100644 index 7645c69e9..000000000 --- a/llama_stack/providers/remote/safety/sample/sample.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.safety import Safety -from llama_stack.apis.shields import Shield - -from .config import SampleConfig - - -class SampleSafetyImpl(Safety): - def __init__(self, config: SampleConfig): - self.config = config - - async def register_shield(self, shield: Shield) -> None: - # these are the safety shields the Llama Stack will use to route requests to this provider - # perform validation here if necessary - pass - - async def initialize(self): - pass diff --git a/llama_stack/providers/remote/tool_runtime/bing_search/config.py b/llama_stack/providers/remote/tool_runtime/bing_search/config.py index 67283d8d5..4f089439f 100644 --- a/llama_stack/providers/remote/tool_runtime/bing_search/config.py +++ b/llama_stack/providers/remote/tool_runtime/bing_search/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -14,3 +14,9 @@ class BingSearchToolConfig(BaseModel): api_key: Optional[str] = None top_k: int = 3 + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "api_key": "${env.BING_API_KEY:}", + } diff --git a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py index ffe4c9887..30ac407bc 100644 --- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel class ModelContextProtocolConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py index 13996b639..8ea49c7b5 100644 --- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -13,3 +13,9 @@ class WolframAlphaToolConfig(BaseModel): """Configuration for WolframAlpha Tool Runtime""" api_key: Optional[str] = None + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "api_key": "${env.WOLFRAM_ALPHA_API_KEY:}", + } diff --git a/llama_stack/providers/remote/vector_io/qdrant/config.py b/llama_stack/providers/remote/vector_io/qdrant/config.py index f212882d8..ce68aa492 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/config.py +++ b/llama_stack/providers/remote/vector_io/qdrant/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -24,3 +24,9 @@ class QdrantVectorIOConfig(BaseModel): timeout: Optional[int] = None host: Optional[str] = None path: Optional[str] = None + + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return { + "api_key": "${env.QDRANT_API_KEY}", + } diff --git a/llama_stack/providers/remote/vector_io/sample/__init__.py b/llama_stack/providers/remote/vector_io/sample/__init__.py deleted file mode 100644 index 221f47b1c..000000000 --- a/llama_stack/providers/remote/vector_io/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from typing import Any - -from .config import SampleVectorIOConfig - - -async def get_adapter_impl(config: SampleVectorIOConfig, _deps) -> Any: - from .sample import SampleVectorIOImpl - - impl = SampleVectorIOImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/vector_io/sample/config.py b/llama_stack/providers/remote/vector_io/sample/config.py deleted file mode 100644 index 5126e5eff..000000000 --- a/llama_stack/providers/remote/vector_io/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleVectorIOConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/vector_io/sample/sample.py b/llama_stack/providers/remote/vector_io/sample/sample.py deleted file mode 100644 index cb7193cf4..000000000 --- a/llama_stack/providers/remote/vector_io/sample/sample.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.vector_dbs import VectorDB -from llama_stack.apis.vector_io import VectorIO - -from .config import SampleVectorIOConfig - - -class SampleVectorIOImpl(VectorIO): - def __init__(self, config: SampleVectorIOConfig): - self.config = config - - async def register_vector_db(self, vector_db: VectorDB) -> None: - # these are the vector dbs the Llama Stack will use to route requests to this provider - # perform validation here if necessary - pass - - async def initialize(self): - pass - - async def shutdown(self): - pass diff --git a/llama_stack/providers/remote/vector_io/weaviate/config.py b/llama_stack/providers/remote/vector_io/weaviate/config.py index 6aad9a5a6..cc587f252 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/config.py +++ b/llama_stack/providers/remote/vector_io/weaviate/config.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel @@ -13,4 +15,6 @@ class WeaviateRequestProviderData(BaseModel): class WeaviateVectorIOConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 00a02e0d5..39ed8cf48 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -45,14 +45,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 43d3158ba..8315f75d5 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -23,7 +23,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] vector_io: - provider_id: faiss provider_type: inline::faiss @@ -43,14 +44,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml index 715d7c86d..ae2b3912c 100644 --- a/llama_stack/templates/ci-tests/run.yaml +++ b/llama_stack/templates/ci-tests/run.yaml @@ -28,7 +28,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -47,14 +48,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index ddec3a715..8a62a5a42 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 9394c94ef..31c63bd83 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -27,7 +27,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -46,14 +47,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index f908af8c3..dba13b357 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -57,7 +57,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -76,14 +77,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite 
+ namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index e04141a07..2d79a3548 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -56,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -88,7 +100,8 @@ providers: max_results: 3 - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 369b9ae7b..285495ad9 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -82,7 +95,8 @@ providers: max_results: 3 - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml index 78212c8d9..6afea2355 100644 --- a/llama_stack/templates/groq/run.yaml +++ b/llama_stack/templates/groq/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 867d7a076..f6f23a987 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -36,7 +36,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -55,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index d60acdefd..461f97128 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index e58ad15b3..7f1724f34 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -36,7 +36,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference 
provider_type: inline::meta-reference @@ -55,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index 5045e821a..ac013488b 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index caac65c8c..190c08494 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -38,7 +38,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -57,14 +58,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index bade9a076..07763a4df 100644 --- 
a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -32,7 +32,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -51,14 +52,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index f131e8ea6..51b9dc250 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -34,7 +34,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -53,14 +54,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 52e78df7b..213e22cb2 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -28,7 +28,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -47,14 +48,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index a96031272..2b8eb44db 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -49,14 +49,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -90,7 +102,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 661d880a7..c9531f417 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -27,7 +27,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -46,14 +47,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -87,7 +100,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 97c54e621..5ef25435b 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -54,7 +54,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -73,14 +74,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 45af8427a..9741f5302 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -38,7 +38,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -98,7 +111,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 674085045..e26b20e88 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -32,7 +32,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -44,14 +45,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -92,7 +105,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} 
metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index cfa0cc194..616d82a61 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -37,7 +37,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index e1d85f59a..db54c0393 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index fc73e0978..dafb59aa9 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -30,7 +30,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -49,14 +50,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 3a7d3dfba..e0bf46c11 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -56,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface 
- config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -97,7 +109,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 10668914a..9d0acaf31 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -91,7 +104,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 8a15ff016..bf85de0a2 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -35,7 +35,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -54,14 +55,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/tests/unit/providers/test_configs.py b/tests/unit/providers/test_configs.py new file mode 100644 index 000000000..246470372 --- /dev/null +++ 
b/tests/unit/providers/test_configs.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import pytest +from pydantic import BaseModel + +from llama_stack.distribution.distribution import get_provider_registry, providable_apis +from llama_stack.distribution.utils.dynamic import instantiate_class_type + + +class TestProviderConfigurations: + """Test suite for testing provider configurations across all API types.""" + + def test_all_api_providers_exist(self): + provider_registry = get_provider_registry() + for api in providable_apis(): + providers = provider_registry.get(api, {}) + assert providers, f"No providers found for API type: {api}" + + @pytest.mark.parametrize("api", providable_apis()) + def test_api_providers(self, api): + provider_registry = get_provider_registry() + providers = provider_registry.get(api, {}) + assert providers, f"No providers found for API type: {api}" + + failures = [] + for provider_type, provider_spec in providers.items(): + try: + self._verify_provider_config(provider_type, provider_spec) + except Exception as e: + failures.append(f"Failed to verify {provider_type} config: {str(e)}") + + if failures: + pytest.fail("\n".join(failures)) + + def _verify_provider_config(self, provider_type, provider_spec): + """Helper method to verify a single provider configuration.""" + # Get the config class + config_class_name = provider_spec.config_class + config_type = instantiate_class_type(config_class_name) + + assert issubclass(config_type, BaseModel), f"{config_class_name} is not a subclass of BaseModel" + + assert hasattr(config_type, "sample_run_config"), f"{config_class_name} does not have sample_run_config method" + + sample_config = config_type.sample_run_config(__distro_dir__="foobarbaz") + assert isinstance(sample_config, dict), f"{config_class_name}.sample_run_config() did not return a dict" From 9617468d13edfa98d4ae6fa02b645a31d4ad9fe4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Mar 2025 09:44:26 -0700 Subject: [PATCH 13/38] fix: passthrough provider template + fix (#1612) # What does this PR do? 
- Fix issue w/ passthrough provider [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan llama stack run [//]: # (## Documentation) --- distributions/dependencies.json | 34 +++ .../self_hosted_distro/passthrough.md | 42 ++++ llama_stack/templates/passthrough/__init__.py | 7 + llama_stack/templates/passthrough/build.yaml | 4 +- .../templates/passthrough/doc_template.md | 35 +++ .../templates/passthrough/passthrough.py | 201 ++++++++++++++++++ .../passthrough/run-with-safety.yaml | 154 ++++++++++++++ llama_stack/templates/passthrough/run.yaml | 36 +++- 8 files changed, 506 insertions(+), 7 deletions(-) create mode 100644 docs/source/distributions/self_hosted_distro/passthrough.md create mode 100644 llama_stack/templates/passthrough/__init__.py create mode 100644 llama_stack/templates/passthrough/doc_template.md create mode 100644 llama_stack/templates/passthrough/passthrough.py create mode 100644 llama_stack/templates/passthrough/run-with-safety.yaml diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 82fbcec8d..c3f039247 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -487,6 +487,40 @@ "transformers", "uvicorn" ], + "passthrough": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "autoevals", diff --git a/docs/source/distributions/self_hosted_distro/passthrough.md b/docs/source/distributions/self_hosted_distro/passthrough.md new file mode 100644 index 000000000..558d7ca08 --- /dev/null +++ b/docs/source/distributions/self_hosted_distro/passthrough.md @@ -0,0 +1,42 @@ +--- +orphan: true +--- + +# Passthrough Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-passthrough` distribution consists of the following provider configurations. 
+ +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | +| inference | `remote::passthrough`, `inline::sentence-transformers` | +| safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | +| telemetry | `inline::meta-reference` | +| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | + + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `PASSTHROUGH_API_KEY`: Passthrough API Key (default: ``) +- `PASSTHROUGH_URL`: Passthrough URL (default: ``) + +### Models + +The following models are available by default: + +- `llama3.1-8b-instruct ` +- `llama3.2-11b-vision-instruct ` diff --git a/llama_stack/templates/passthrough/__init__.py b/llama_stack/templates/passthrough/__init__.py new file mode 100644 index 000000000..9632c09fb --- /dev/null +++ b/llama_stack/templates/passthrough/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .passthrough import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml index 5fed5286e..fb1fb1066 100644 --- a/llama_stack/templates/passthrough/build.yaml +++ b/llama_stack/templates/passthrough/build.yaml @@ -1,9 +1,10 @@ version: '2' distribution_spec: - description: Use for running LLM inference with the endpoint that compatible with Llama Stack API + description: Use Passthrough hosted llama-stack endpoint for LLM inference providers: inference: - remote::passthrough + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb @@ -26,6 +27,7 @@ distribution_spec: tool_runtime: - remote::brave-search - remote::tavily-search + - remote::wolfram-alpha - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol diff --git a/llama_stack/templates/passthrough/doc_template.md b/llama_stack/templates/passthrough/doc_template.md new file mode 100644 index 000000000..f9e88873d --- /dev/null +++ b/llama_stack/templates/passthrough/doc_template.md @@ -0,0 +1,35 @@ +--- +orphan: true +--- +# Passthrough Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
+ +{{ providers_table }} + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }} {{ model.doc_string }}` +{% endfor %} +{% endif %} diff --git a/llama_stack/templates/passthrough/passthrough.py b/llama_stack/templates/passthrough/passthrough.py new file mode 100644 index 000000000..cc3f55937 --- /dev/null +++ b/llama_stack/templates/passthrough/passthrough.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.apis.models.models import ModelType +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.inference.sentence_transformers import ( + SentenceTransformersInferenceConfig, +) +from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig +from llama_stack.providers.remote.inference.passthrough.config import ( + PassthroughImplConfig, +) +from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, +) + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::passthrough", "inline::sentence-transformers"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "remote::wolfram-alpha", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + + name = "passthrough" + + inference_provider = Provider( + provider_id="passthrough", + provider_type="remote::passthrough", + config=PassthroughImplConfig.sample_run_config(), + ) + embedding_provider = Provider( + provider_id="sentence-transformers", + provider_type="inline::sentence-transformers", + config=SentenceTransformersInferenceConfig.sample_run_config(), + ) + vector_io_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) + + default_models = [ + ModelInput( + metadata={}, + model_id="meta-llama/Llama-3.1-8B-Instruct", + provider_id="passthrough", + provider_model_id="llama3.1-8b-instruct", + model_type=ModelType.llm, + ), + ModelInput( + metadata={}, + model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", + provider_id="passthrough", + provider_model_id="llama3.2-11b-vision-instruct", + model_type=ModelType.llm, + ), + ] + + embedding_model = ModelInput( + model_id="all-MiniLM-L6-v2", + provider_id="sentence-transformers", + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": 384, + }, + ) + default_tool_groups = [ + 
ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Use Passthrough hosted llama-stack endpoint for LLM inference", + container_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + available_models_by_provider={ + "passthrough": [ + ProviderModelEntry( + provider_model_id="llama3.1-8b-instruct", + model_type=ModelType.llm, + ), + ProviderModelEntry( + provider_model_id="llama3.2-11b-vision-instruct", + model_type=ModelType.llm, + ), + ], + }, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider, embedding_provider], + "vector_io": [vector_io_provider], + }, + default_models=default_models + [embedding_model], + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + default_tool_groups=default_tool_groups, + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + embedding_provider, + ], + "vector_io": [vector_io_provider], + "safety": [ + Provider( + provider_id="llama-guard", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="llama-guard-vision", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="code-scanner", + provider_type="inline::code-scanner", + config={}, + ), + ], + }, + default_models=[ + *default_models, + embedding_model, + ], + default_shields=[ + ShieldInput( + shield_id="meta-llama/Llama-Guard-3-8B", + provider_id="llama-guard", + ), + ShieldInput( + shield_id="meta-llama/Llama-Guard-3-11B-Vision", + provider_id="llama-guard-vision", + ), + ShieldInput( + shield_id="CodeScanner", + provider_id="code-scanner", + ), + ], + default_tool_groups=default_tool_groups, + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "PASSTHROUGH_API_KEY": ( + "", + "Passthrough API Key", + ), + "PASSTHROUGH_URL": ( + "", + "Passthrough URL", + ), + }, + ) diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml new file mode 100644 index 000000000..fbfa4afe7 --- /dev/null +++ b/llama_stack/templates/passthrough/run-with-safety.yaml @@ -0,0 +1,154 @@ +version: '2' +image_name: passthrough +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: passthrough + provider_type: remote::passthrough + config: + url: ${env.PASSTHROUGH_URL} + api_key: ${env.PASSTHROUGH_API_KEY} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + - provider_id: llama-guard-vision + provider_type: inline::llama-guard + config: {} + - provider_id: code-scanner + provider_type: inline::code-scanner + config: {} + 
agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db +models: +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: passthrough + provider_model_id: llama3.1-8b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: passthrough + provider_model_id: llama3.2-11b-vision-instruct + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +shields: +- shield_id: meta-llama/Llama-Guard-3-8B + provider_id: llama-guard +- shield_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: llama-guard-vision +- shield_id: CodeScanner + provider_id: code-scanner +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml index 2548faa5d..6956bc6e3 100644 --- a/llama_stack/templates/passthrough/run.yaml +++ b/llama_stack/templates/passthrough/run.yaml @@ -31,7 +31,8 @@ providers: safety: - 
provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -80,6 +93,10 @@ providers: config: api_key: ${env.TAVILY_SEARCH_API_KEY:} max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} @@ -91,7 +108,7 @@ providers: config: {} metadata_store: type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-llama}/registry.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db models: - metadata: {} model_id: meta-llama/Llama-3.1-8B-Instruct @@ -103,15 +120,22 @@ models: provider_id: passthrough provider_model_id: llama3.2-11b-vision-instruct model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha - toolgroup_id: builtin::rag provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter From 5e54113b1930bc92b63f7d025d064fdc4986c95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 18:14:01 +0100 Subject: [PATCH 14/38] ci: add dynamic CI job to test templates (#1230) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Introduced a new CI job that dynamically generates a build matrix based on available templates from `llama_stack/templates/*/build.yaml`. This allows automated testing for all templates without manual intervention. The CI currently builds for venv and containers. 
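For reference, the matrix-generation step boils down to listing each template's `build.yaml` and emitting the directory names as a JSON array. A sketch of what it produces (the template names below are illustrative; the real list is whatever exists under `llama_stack/templates/` when the job runs):

```
$ ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]'
["meta-reference-gpu","ollama","remote-vllm","tgi","together"]
```

Each entry then becomes one `template` value in the build matrix, crossed with `image-type: [venv, container]`.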
Signed-off-by: Sébastien Han ~Will pass once https://github.com/meta-llama/llama-stack/pull/1228 merges.~ Signed-off-by: Sébastien Han --- .github/workflows/providers-build.yml | 76 +++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 .github/workflows/providers-build.yml diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml new file mode 100644 index 000000000..1bf6591dc --- /dev/null +++ b/.github/workflows/providers-build.yml @@ -0,0 +1,76 @@ +name: Test Llama Stack Build + +on: + push: + branches: + - main + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + pull_request: + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + templates: ${{ steps.set-matrix.outputs.templates }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Generate Template List + id: set-matrix + run: | + templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]') + echo "templates=$templates" >> "$GITHUB_OUTPUT" + + build: + needs: generate-matrix + runs-on: ubuntu-latest + strategy: + matrix: + template: ${{ fromJson(needs.generate-matrix.outputs.templates) }} + image-type: [venv, container] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + python-version: "3.10" + + - name: Install LlamaStack + run: | + uv venv + source .venv/bin/activate + uv pip install -e . + + - name: Print build dependencies + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only + + - name: Run Llama Stack Build + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test + + - name: Print dependencies in the image + if: matrix.image-type == 'venv' + run: | + source test/bin/activate + uv pip list From 98811cc0347b4921e772236a3c5117d73c98600e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Mar 2025 11:01:52 -0700 Subject: [PATCH 15/38] fix: clean up test imports (#1600) # What does this PR do? 
- Clean up dead SDK code in https://github.com/meta-llama/llama-stack-client-python/pull/198 - Regen for local cache key issue [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` pytest -v -s --nbval-lax ./docs/getting_started.ipynb LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/ --text-model meta-llama/Llama-3.3-70B-Instruct ``` - CI: https://github.com/meta-llama/llama-stack-ops/actions/runs/13823512113 image [//]: # (## Documentation) --- tests/integration/agents/test_agents.py | 5 ++-- tests/integration/datasetio/test_datasetio.py | 17 ++++++++++-- tests/integration/scoring/test_scoring.py | 26 +++++++++++++------ 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index f6bde8927..61249ad17 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -10,8 +10,7 @@ from uuid import uuid4 import pytest from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.event_logger import EventLogger -from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument -from llama_stack_client.types.memory_insert_params import Document +from llama_stack_client.types.agents.turn_create_params import Document from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig from llama_stack.apis.agents.agents import ( @@ -242,7 +241,7 @@ def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inferen codex_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config) session_id = codex_agent.create_session(f"test-session-{uuid4()}") - inflation_doc = AgentDocument( + inflation_doc = Document( content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv", mime_type="text/csv", ) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index f112071a6..459589e7b 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -9,11 +9,25 @@ import mimetypes import os from pathlib import Path +import pytest + # How to run this test: # # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio +@pytest.fixture +def dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, dataset_id=dataset_id) + yield + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister(dataset_id) + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + + def data_url_from_file(file_path: str) -> str: if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") @@ -80,8 +94,7 @@ def test_register_unregister_dataset(llama_stack_client): assert len(response) == 0 -def test_get_rows_paginated(llama_stack_client): - register_dataset(llama_stack_client) +def test_get_rows_paginated(llama_stack_client, dataset_for_test): response = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 2fcdf54e2..970a96f40 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -10,6 +10,19 @@ import pytest from ..datasetio.test_datasetio import 
register_dataset +@pytest.fixture +def rag_dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, for_rag=True, dataset_id=dataset_id) + yield # This is where the test function will run + + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister(dataset_id) + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + + @pytest.fixture def sample_judge_prompt_template(): return "Output a number response in the following format: Score: , where is the number between 0 and 9." @@ -79,9 +92,7 @@ def test_scoring_functions_register( # TODO: add unregister api for scoring functions -def test_scoring_score(llama_stack_client): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score(llama_stack_client, rag_dataset_for_test): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -115,9 +126,9 @@ def test_scoring_score(llama_stack_client): assert len(response.results[x].score_rows) == 5 -def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score_with_params_llm_as_judge( + llama_stack_client, sample_judge_prompt_template, judge_model_id, rag_dataset_for_test +): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -167,9 +178,8 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge ], ) def test_scoring_score_with_aggregation_functions( - llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id + llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id, rag_dataset_for_test ): - register_dataset(llama_stack_client, for_rag=True) rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, From 42788a9d505950085dfde878fc51620f3b334b5c Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 13 Mar 2025 11:21:10 -0700 Subject: [PATCH 16/38] test: re record responses after client sync (#1615) Summary: Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --record-responses --- .../recorded_responses/chat_completion.json | 5010 ++++++++++++++++- .../recorded_responses/invoke_tool.json | 42 +- 2 files changed, 4748 insertions(+), 304 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 8694cc271..37bb28ac2 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12758,6 +12758,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + 
}, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -12863,6 +12988,207 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"get_boiling_point\", \"parameters\": {\"liquid_name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"polyjuice\", \"celcius\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"false\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "b0413eb2-f446-4e09-910b-7d8ba4375c87", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13044,6 +13370,207 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"type\": \"function_call\",\n \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "name\": \"get_boiling_point\",\n \"parameters\": {\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"liquid_name\": \"polyjuice\",\n \"celcius", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"true\"\n }\n}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "62095a5a-c53c-4850-9f4f-b3a41699a32b", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 43 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 99 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13370,6 +13897,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": 
\"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", 
\"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13600,6 +14252,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": 
\"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 109 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and 
answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { @@ -13725,6 +14502,458 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", 
\"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"get_boiling_point\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "polyjuice\", \"celcius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "139fe8b9-7bfc-4dcb-ac0d-da1d97257c6e", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": 
\"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\", \"celcius\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "49ab2b64-cbcb-4e71-b02c-99026116c45e", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -14250,7 +15479,7 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store", + "text": " customer smiled and said \"hello\" to the", "type": "text" }, "event_type": { @@ -14270,7 +15499,7 
@@ "data": { "event": { "delta": { - "text": " clerk.", + "text": " friendly store clerk.", "type": "text" }, "event_type": { @@ -17388,7 +18617,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that there is an issue with", + "text": " error message indicates that there is an issue with the", "type": "text" }, "event_type": { @@ -17408,7 +18637,7 @@ "data": { "event": { "delta": { - "text": " the import statement. However, the code provided does not contain any", + "text": " import statement. However, the code provided does", "type": "text" }, "event_type": { @@ -17428,7 +18657,7 @@ "data": { "event": { "delta": { - "text": " import statements that would cause this error.\n\nTo provide a more accurate", + "text": " not contain any import statements that would cause this error.\n\nTo provide", "type": "text" }, "event_type": { @@ -17448,7 +18677,7 @@ "data": { "event": { "delta": { - "text": " answer, I would need to know the contents of the CSV file", + "text": " a more accurate answer, I would need to know the contents of the", "type": "text" }, "event_type": { @@ -17468,7 +18697,7 @@ "data": { "event": { "delta": { - "text": " or more information about the error message.\n\nHowever, based on the", + "text": " CSV file or more information about the error message.\n\nHowever, based on", "type": "text" }, "event_type": { @@ -17488,7 +18717,7 @@ "data": { "event": { "delta": { - "text": " code provided, it seems like the code is trying to load a", + "text": " the code provided, it seems like the intention is to load a CSV", "type": "text" }, "event_type": { @@ -17508,7 +18737,7 @@ "data": { "event": { "delta": { - "text": " CSV file and print some basic information about it. If the file", + "text": " file and print some basic information about it. 
If the file is not", "type": "text" }, "event_type": { @@ -17528,7 +18757,7 @@ "data": { "event": { "delta": { - "text": " is not found or there is an issue with the file path,", + "text": " found or there is an issue with the file path, this could cause", "type": "text" }, "event_type": { @@ -17548,7 +18777,7 @@ "data": { "event": { "delta": { - "text": " this could cause an error.\n\nHere is a", + "text": " an error.\n\nHere is an updated version of the code that includes some", "type": "text" }, "event_type": { @@ -17568,7 +18797,7 @@ "data": { "event": { "delta": { - "text": " revised version of the code that includes some error", + "text": " error handling:\n\n```\nimport pandas as pd\n", "type": "text" }, "event_type": { @@ -17588,7 +18817,7 @@ "data": { "event": { "delta": { - "text": " handling:\n\n```\nimport pandas as pd\nimport code_interpreter", + "text": "import code_interpreter\n\ntry:\n #", "type": "text" }, "event_type": { @@ -17608,7 +18837,7 @@ "data": { "event": { "delta": { - "text": "\n\ntry:\n # Load the CSV file", + "text": " Load the CSV file\n df = pd.read_csv(\"/", "type": "text" }, "event_type": { @@ -17628,7 +18857,7 @@ "data": { "event": { "delta": { - "text": "\n df = pd.read_csv(\"/var/folders/cz", + "text": "var/folders/cz/vyh7y1d11", "type": "text" }, "event_type": { @@ -17648,7 +18877,7 @@ "data": { "event": { "delta": { - "text": "/vyh7y1d11xg", + "text": "xg881lsxsshnc5c0000gn/T/tmpmy", "type": "text" }, "event_type": { @@ -17668,7 +18897,7 @@ "data": { "event": { "delta": { - "text": "881lsxsshnc5", + "text": "lybr76/IEQ51fUginflation.csv\")\n\n ", "type": "text" }, "event_type": { @@ -17688,7 +18917,7 @@ "data": { "event": { "delta": { - "text": "c0000gn/T/tmpflpgiagc/", + "text": " # Print the first few rows of the dataframe\n print(df", "type": "text" }, "event_type": { @@ -17708,7 +18937,7 @@ "data": { "event": { "delta": { - "text": "8S20Zj2Oinflation.csv\")\n\n ", + "text": ".head())\n\n # Print the data", "type": "text" }, "event_type": { @@ -17728,7 +18957,7 @@ "data": { "event": { "delta": { - "text": " # Print the first few rows of the dataframe\n print(df.head", + "text": " types of each column\n print(df.dtypes)\n\n #", "type": "text" }, "event_type": { @@ -17748,7 +18977,7 @@ "data": { "event": { "delta": { - "text": "())\n\n # Print the data types of each column\n print", + "text": " Print the summary statistics of the dataframe\n", "type": "text" }, "event_type": { @@ -17768,7 +18997,7 @@ "data": { "event": { "delta": { - "text": "(df.dtypes)\n\n # Print the", + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The", "type": "text" }, "event_type": { @@ -17788,7 +19017,7 @@ "data": { "event": { "delta": { - "text": " summary statistics of the dataframe\n ", + "text": " file was not found.\")\nexcept pd.errors.Empty", "type": "text" }, "event_type": { @@ -17808,7 +19037,7 @@ "data": { "event": { "delta": { - "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file", + "text": "DataError:\n print(\"The file", "type": "text" }, "event_type": { @@ -17828,7 +19057,7 @@ "data": { "event": { "delta": { - "text": " was not found.\")\nexcept pd.errors.EmptyDataError", + "text": " is empty.\")\nexcept pd.errors.ParserError:\n", "type": "text" }, "event_type": { @@ -17848,7 +19077,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"The file is empty.\")\nexcept pd.errors.ParserError", + "text": " print(\"An error occurred while parsing the file.\")\n", "type": "text" }, "event_type": { @@ -17868,7 
+19097,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"An error occurred while parsing the", + "text": "except Exception as e:\n print(\"An error occurred: \",", "type": "text" }, "event_type": { @@ -17888,7 +19117,7 @@ "data": { "event": { "delta": { - "text": " file.\")\nexcept Exception as e:\n print", + "text": " str(e))\n```\n\nThis code will", "type": "text" }, "event_type": { @@ -17908,7 +19137,7 @@ "data": { "event": { "delta": { - "text": "(\"An error occurred: \", str(e))\n``", + "text": " catch specific exceptions that could occur when loading the CSV file and print", "type": "text" }, "event_type": { @@ -17928,47 +19157,7 @@ "data": { "event": { "delta": { - "text": "`\n\nThis code will catch specific exceptions that could occur when loading the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " CSV file and print a more", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " informative error message.", + "text": " a more informative error message.", "type": "text" }, "event_type": { @@ -18007,17 +19196,17 @@ { "metric": "prompt_tokens", "unit": null, - "value": 393 + "value": 389 }, { "metric": "completion_tokens", "unit": null, - "value": 331 + "value": 328 }, { "metric": "total_tokens", "unit": null, - "value": 724 + "value": 717 } ] } @@ -18083,7 +19272,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "tool_call": "import pandas as pd\nimport code_interpreter", "type": "tool_call" }, "event_type": { @@ -18108,7 +19297,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var/f", + "tool_call": "\n\n# Load the CSV file", "type": "tool_call" }, "event_type": { @@ -18133,7 +19322,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "tool_call": "\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -18158,7 +19347,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "sshnc5c0000gn/T/tmpfl", + "tool_call": "z/vyh7y1d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -18183,7 +19372,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "pgiagc/8S20Zj2Oinflation", + "tool_call": "5c0000gn/T/tmpmylybr76/IE", "type": "tool_call" }, "event_type": { @@ -18208,7 +19397,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".csv\")\n\n# Print the first few rows of the", + "tool_call": "Q51fUginflation.csv\")\n\n# Print the first few", "type": "tool_call" }, "event_type": { @@ -18233,7 +19422,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " dataframe\nprint(df.head())\n\n# Print the data types of each", + "tool_call": " rows of the dataframe\nprint(df.head())\n\n# Print the data", "type": "tool_call" }, "event_type": { @@ -18258,7 +19447,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " column\nprint(df.dtypes)\n\n#", + "tool_call": " types of each column\nprint(df.dtypes)\n\n", "type": "tool_call" }, "event_type": { @@ -18283,7 +19472,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Print the summary statistics of the dataframe\nprint(df.describe())", + "tool_call": "# Print the summary statistics of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18310,9 +19524,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/IEQ51fUginflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "e999a578-cbd8-4bb8-bc53-deb2fff1ffce", + "call_id": "c4c54781-a26e-427d-aea8-6d4b9829bbcc", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -18361,7 +19575,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 215 + "value": 213 }, { "metric": "completion_tokens", @@ -18371,7 +19585,7 @@ { "metric": "total_tokens", "unit": null, - "value": 225 + "value": 223 } ] } @@ -18462,7 +19676,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " CSV file\ndf = pd.read", + "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/cz/v", "type": "tool_call" }, "event_type": { @@ -18487,7 +19701,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh", + "tool_call": "yh7y1d11xg881lsx", "type": "tool_call" }, "event_type": { @@ -18512,7 +19726,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "7y1d11xg881lsxsshnc5c", + "tool_call": "sshnc5c0000gn/T/tmpmylybr76", "type": "tool_call" }, "event_type": { @@ -18537,7 +19751,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0000gn/T/tmpflpgiagc/8S", + "tool_call": 
"/IEQ51fUginflation.csv\")\n\n", "type": "tool_call" }, "event_type": { @@ -18562,7 +19776,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "20Zj2Oinflation.csv\")\n\n# Print the first", + "tool_call": "# Print the first few rows of the dataframe", "type": "tool_call" }, "event_type": { @@ -18587,7 +19801,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " few rows of the dataframe\nprint(df.head())\n\n#", + "tool_call": "\nprint(df.head())\n\n# Print the data types of", "type": "tool_call" }, "event_type": { @@ -18612,7 +19826,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Print the data types of each column\nprint", + "tool_call": " each column\nprint(df.dtypes)\n\n# Print the summary", "type": "tool_call" }, "event_type": { @@ -18637,32 +19851,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(df.dtypes)\n\n# Print the summary statistics of the dataframe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint(df.describe())", + "tool_call": " statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18689,9 +19878,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/IEQ51fUginflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "ea72d524-2d0f-4220-a898-4c295315235e", + "call_id": "1f1ed34a-bffb-459d-9f64-eb66d13b2aa5", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -21730,7 +22919,7 @@ "data": { "event": { "delta": { - "text": " code will create a line plot of the average yearly inflation over time. The", + "text": " code will create a line plot of the average yearly inflation over", "type": "text" }, "event_type": { @@ -21750,7 +22939,7 @@ "data": { "event": { "delta": { - "text": " x-axis represents the year, and the y-axis represents the average yearly inflation", + "text": " time. The x-axis represents the year, and the y-axis", "type": "text" }, "event_type": { @@ -21770,7 +22959,7 @@ "data": { "event": { "delta": { - "text": ". The plot will show the trend of average yearly inflation over the years", + "text": " represents the average yearly inflation. 
The plot will show the trend of", "type": "text" }, "event_type": { @@ -21790,7 +22979,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": " average yearly inflation over the years.", "type": "text" }, "event_type": { @@ -21829,7 +23018,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 635 + "value": 631 }, { "metric": "completion_tokens", @@ -21839,7 +23028,7 @@ { "metric": "total_tokens", "unit": null, - "value": 691 + "value": 687 } ] } @@ -21905,7 +23094,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "tool_call": "import pandas as pd\nimport matplotlib", "type": "tool_call" }, "event_type": { @@ -21930,7 +23119,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"/var/f", + "tool_call": ".pyplot as plt\n\n# Load data", "type": "tool_call" }, "event_type": { @@ -21955,7 +23144,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "tool_call": "\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -21980,7 +23169,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "sshnc5c0000gn/T/tmpflpgiagc/", + "tool_call": "z/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -22005,7 +23194,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation", + "tool_call": "lsxsshnc5c0000gn/T/tmpmy", "type": "tool_call" }, "event_type": { @@ -22030,7 +23219,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\ndf['Average'] = df[['Jan', 'Feb', '", + "tool_call": "lybr76/Dhwctgpwinflation.csv\")\n\n#", "type": "tool_call" }, "event_type": { @@ -22055,7 +23244,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Mar', 'Apr', 'May', 'Jun', 'Jul',", + "tool_call": " Calculate average yearly inflation\ndf['Average'] = df[['", "type": "tool_call" }, "event_type": { @@ -22080,7 +23269,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 'Aug', 'Sep', 'Oct', 'Nov', 'Dec", + "tool_call": "Jan', 'Feb', 'Mar', 'Apr", "type": "tool_call" }, "event_type": { @@ -22105,7 +23294,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize", + "tool_call": "', 'May', 'Jun', 'Jul', '", "type": "tool_call" }, "event_type": { @@ -22130,7 +23319,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "=(10,6))\nplt.plot(df['Year'], df['Average", + "tool_call": "Aug', 'Sep', 'Oct', 'Nov', 'Dec", "type": "tool_call" }, "event_type": { @@ -22155,7 +23344,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly", + "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt", "type": "tool_call" }, "event_type": { @@ -22180,7 +23369,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Inflation')\nplt.title('Average Yearly Inflation Over", + "tool_call": ".figure(figsize=(10,6))\n", "type": "tool_call" }, "event_type": { @@ 
-22205,7 +23394,82 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "tool_call": "plt.plot(df['Year'], df['Average'])\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.title('Average Yearly Inflation Over Time')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".grid(True)\nplt.show()", "type": "tool_call" }, "event_type": { @@ -22232,9 +23496,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/Dhwctgpwinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" }, - "call_id": "f82fa3fd-e3be-4cb7-9298-8b4625cf709e", + "call_id": "73dbb112-a028-48fd-8664-a6c408d1f13d", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -22283,7 +23547,7 @@ { "metric": "prompt_tokens", "unit": null, - 
"value": 454 + "value": 452 }, { "metric": "completion_tokens", @@ -22293,7 +23557,7 @@ { "metric": "total_tokens", "unit": null, - "value": 464 + "value": 462 } ] } @@ -24036,7 +25300,7 @@ "data": { "event": { "delta": { - "text": " CSV file contains 10 rows and 13 columns. The columns are", + "text": " CSV file contains 10 rows and 13 columns. The columns", "type": "text" }, "event_type": { @@ -24056,7 +25320,7 @@ "data": { "event": { "delta": { - "text": " named 'Year', 'Jan', 'Feb', 'Mar', '", + "text": " are named 'Year', 'Jan', 'Feb', 'Mar", "type": "text" }, "event_type": { @@ -24076,7 +25340,7 @@ "data": { "event": { "delta": { - "text": "Apr', 'May', 'Jun', 'Jul', 'Aug',", + "text": "', 'Apr', 'May', 'Jun', 'Jul',", "type": "text" }, "event_type": { @@ -24096,7 +25360,7 @@ "data": { "event": { "delta": { - "text": " 'Sep', 'Oct', 'Nov', 'Dec'. The data", + "text": " 'Aug', 'Sep', 'Oct',", "type": "text" }, "event_type": { @@ -24116,7 +25380,7 @@ "data": { "event": { "delta": { - "text": " types of these columns are int64 for 'Year", + "text": " 'Nov', 'Dec'. The", "type": "text" }, "event_type": { @@ -24136,7 +25400,7 @@ "data": { "event": { "delta": { - "text": "' and float64 for the rest.\n\nIt appears that this CSV file", + "text": " data types of these columns are int64 for", "type": "text" }, "event_type": { @@ -24156,7 +25420,7 @@ "data": { "event": { "delta": { - "text": " contains monthly inflation rates for different years. The 'Year' column represents", + "text": " 'Year' and float64 for the rest", "type": "text" }, "event_type": { @@ -24176,7 +25440,7 @@ "data": { "event": { "delta": { - "text": " the year, and the rest of the columns represent the inflation rates", + "text": ".\n\nIt appears that this CSV file contains monthly inflation rates for", "type": "text" }, "event_type": { @@ -24196,7 +25460,7 @@ "data": { "event": { "delta": { - "text": " for each month of the", + "text": " different years. 
The 'Year' column represents the year,", "type": "text" }, "event_type": { @@ -24216,7 +25480,27 @@ "data": { "event": { "delta": { - "text": " year.", + "text": " and the rest of the columns represent the inflation rates for each", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " month of the year.", "type": "text" }, "event_type": { @@ -24255,7 +25539,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 327 + "value": 325 }, { "metric": "completion_tokens", @@ -24265,7 +25549,7 @@ { "metric": "total_tokens", "unit": null, - "value": 452 + "value": 450 } ] } @@ -24356,7 +25640,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7", + "tool_call": "_csv(\"/var/folders/cz/vyh7y1", "type": "tool_call" }, "event_type": { @@ -24381,7 +25665,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "y1d11xg881lsxsshnc5c000", + "tool_call": "d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -24406,7 +25690,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0gn/T/tmpflpgiagc/2VkeqrPlinflation", + "tool_call": "5c0000gn/T/tmpmyly", "type": "tool_call" }, "event_type": { @@ -24431,7 +25715,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".csv\")\n# Rows\nprint(\"Number of rows and columns in", + "tool_call": "br76/Dhwctgpwinflation.csv\")\n#", "type": "tool_call" }, "event_type": { @@ -24456,7 +25740,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are", + "tool_call": " Rows\nprint(\"Number of rows and columns in the data:\",", "type": "tool_call" }, "event_type": { @@ -24481,7 +25765,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of the data", + "tool_call": " df.shape)\n# Columns\nprint(\"Columns of the data are", "type": "tool_call" }, "event_type": { @@ -24506,7 +25790,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of", + "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of", "type": "tool_call" }, "event_type": { @@ -24531,7 +25815,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the columns are:\", df.dtypes)", + "tool_call": " the data are:\", df.columns)\n# Column dt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": "ypes\nprint(\"Datatype of the columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.dtypes)", "type": "tool_call" }, "event_type": { @@ -24558,9 +25892,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/Dhwctgpwinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "b8aab119-7997-428e-81ab-e6aa163f7acc", + "call_id": "f1d86c1d-75bd-43f3-9117-a906e41598f8", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -29279,6 +30613,1666 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ef2c1\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"How to use LoRA in Tor", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "8414f84a-98b1-41eb-90bd-bce084da79eb", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune based on the documentation you provided.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:c4e00\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "query\": \"How to use LoRA in", + "type": "text" + }, + "event_type": 
{ + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "0784780b-c3dc-4f4a-a37f-e75e83e9be61", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the documentation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you provided. 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. 
note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:46132\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.46759978, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company. ... Mark Zuckerberg, Founder, Chairman and Chief Executive Officer. Nick Clegg, President, Global Affairs. Joel Kaplan, Chief Global Affairs Officer. Susan Li, Chief Financial Officer.\\\", \\\"score\\\": 0.46482924, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.036911618, \\\"raw_content\\\": null}]}\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1101 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1119 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, 
\"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -35576,32 +38645,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave_search.call(query", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "=\"Meta founder\")", + "tool_call": "brave_search.call(query=\"Meta founder\")", "type": "tool_call" }, "event_type": { @@ -35630,7 +38674,7 @@ "arguments": { "query": "Meta founder" }, - "call_id": "b81c41ae-5eb7-41b7-b466-78eb25a91bb7", + "call_id": "a9a452ac-a1a1-4414-a107-4cdc283f4129", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -36172,6 +39216,191 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point`", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " fictional liquid from the Harry Potter series. 
The function is only able", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to find the boiling point of real liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, 
\"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36357,6 +39586,151 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + 
"data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": 
\"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -36482,6 +39856,151 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", 
\"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36627,6 +40146,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a 
liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": 
"918c5630-abc9-4500-ac0b-b630e0743561", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36802,6 +40521,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, 
\"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_bo", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "iling_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "364ad4a8-2e6e-4afb-8c81-1cf98774758a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": 
null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -37002,6 +40921,231 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" 
+ }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " couldn't find any information on", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the boiling point of Polyjuice.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Polyjuice is a magical potion in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Harry Potter series that allows the drinker to transform into", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " someone else. It's not a physical substance", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " with a boiling point. 
If you have any other questions, I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'d be happy to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 103 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -37207,6 +41351,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "b41fafca-4559-4a0a-b49b-f4edf893d08a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": 
\"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -37764,7 +42108,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False", + "tool_call": "def is_prime(n):\n if n <= 1:\n", "type": "tool_call" }, "event_type": { @@ -37789,7 +42133,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n <= 3:\n ", + "tool_call": " return False\n if n <= 3:\n return", "type": "tool_call" }, "event_type": { @@ -37814,7 +42158,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return True\n if n % 2 ==", + "tool_call": " True\n if n % 2 == 0 or", "type": "tool_call" }, "event_type": { @@ -37839,7 +42183,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0 or n % 3 == 0:\n return False", + "tool_call": " n % 3 == 0", "type": "tool_call" }, "event_type": { @@ -37864,7 +42208,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n i = 5\n while i * i <= n:\n ", + "tool_call": ":\n return False\n i = 5\n while", "type": "tool_call" }, "event_type": { @@ -37889,7 +42233,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n % i == 0 or n % (i + 2", + "tool_call": " i * i <= n:\n if n % i == ", "type": "tool_call" }, "event_type": { @@ -37914,7 +42258,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ") == 0:\n return False\n i += 6\n ", + "tool_call": "0 or n % (i + 2) == ", "type": "tool_call" }, "event_type": { @@ -37939,7 +42283,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return True\n\ndef get_nth_prime(n):\n count = 0\n ", + "tool_call": "0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -37964,7 +42308,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num = 2\n while True:\n if is_prime(num):\n ", + "tool_call": " += 6\n return True\n\ndef get_nth_prime(n", "type": "tool_call" }, "event_type": { @@ -37989,7 +42333,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n if count == n:\n return num\n", + "tool_call": "):\n count = 0\n", "type": "tool_call" }, "event_type": { @@ -38014,7 +42358,107 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num += 1\n\nprint(get_nth_prime(100))", + "tool_call": " num = 2\n while", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + 
"delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " True:\n if is_prime(num):\n count += 1", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n if count == n:\n return num\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num += 1\n\nprint(get_nth_prime", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(100))", "type": "tool_call" }, "event_type": { @@ -38043,7 +42487,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "d8ece88b-7b3e-4f72-9555-5a928c27012c", + "call_id": "a1296d7e-6ca3-4056-b43f-19a9663e8bcb", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -38548,7 +42992,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -38568,7 +43012,7 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": \"Perplexity company founding", + "text": "parameters\": {\"query\": \"Perplexity", "type": "text" }, "event_type": { @@ -38588,7 +43032,7 @@ "data": { "event": { "delta": { - "text": " date\"}}", + "text": " company founding date\"}}", "type": "text" }, "event_type": { @@ -38617,7 +43061,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "7f40db23-2182-4006-9234-4c5b7dac978f", + "call_id": "75b712aa-fdeb-48bb-be40-c7fcd06242b6", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -38738,7 +43182,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -38763,7 +43207,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", + "tool_call": " \"parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "tool_call" }, "event_type": { @@ -38792,7 +43236,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "7f65affe-6ecb-4db5-b70f-71e05e28c310", + "call_id": "3d505e8e-fe35-486e-9661-27f67702621d", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -39177,7 +43621,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3,", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -39197,7 +43641,7 @@ "data": { "event": { "delta": { - "text": " 1949, with the merger of the Basketball Association of", + "text": " the merger of the Basketball Association of America (", "type": "text" }, "event_type": { @@ -39217,7 +43661,7 @@ "data": { "event": { "delta": { - "text": " America (BAA) and the National Basketball League (NBL", + "text": "BAA) and the National Basketball League (NBL", "type": "text" }, "event_type": { @@ -39352,7 +43796,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", "type": "tool_call" }, "event_type": { @@ -39377,7 +43821,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"when", + "tool_call": "\", \"parameters\": {\"query\": \"when was the n", "type": "tool_call" }, "event_type": { @@ -39402,7 +43846,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " was the nba created\"}}", + "tool_call": "ba created\"}}", "type": "tool_call" }, "event_type": { @@ -39431,7 +43875,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "0f4d0151-e44c-443a-8101-e0ac92c9d45f", + "call_id": "03ce919a-d1b5-4120-896e-433e79910757", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 8db8ad966..f3a2cfbcb 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -167,23 +167,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:15b86\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. 
code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:c4e00\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. 
_setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. 
This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:15efa\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:15efa\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -428,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "bbddbe62-508d-4c8d-9455-3b60bc2825a5", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61" + "9050ae1c-eba1-4846-b550-2db1957fee7d", + "c4e00391-aeb8-4d32-ac41-ae3242f38a19", + "15efa3d7-f804-4d31-ab05-a5524d82b96a", + "c4e00391-aeb8-4d32-ac41-ae3242f38a19", + "15efa3d7-f804-4d31-ab05-a5524d82b96a" ] } } From edfcb02a0e723931dcdb464d6e9bf6856108e502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 20:04:53 +0100 Subject: [PATCH 17/38] ci(ollama): add GitHub Actions workflow for integration tests (#1546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Added a GitHub Action to run inference tests for the Ollama provider. This ensures we have coverage for Ollama integration. --------- Signed-off-by: Sébastien Han Co-authored-by: Ashwin Bharambe --- .github/workflows/integration-tests.yml | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000..af268e728 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,80 @@ +name: Integration tests + +on: + pull_request: + push: + branches: [main] + +jobs: + ollama: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + python-version: "3.10" + + - name: Install Ollama + run: | + curl -fsSL https://ollama.com/install.sh | sh + + - name: Pull Ollama image + run: | + ollama pull llama3.2:3b-instruct-fp16 + + - name: Start Ollama in background + run: | + nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 & + + - name: Set Up Environment and Install Dependencies + run: | + uv sync --extra dev --extra test + uv pip install ollama faiss-cpu + uv pip install -e . + + - name: Wait for Ollama to start + run: | + echo "Waiting for Ollama..." + for i in {1..30}; do + if curl -s http://localhost:11434 | grep -q "Ollama is running"; then + echo "Ollama is running!" 
+ exit 0 + fi + sleep 1 + done + echo "Ollama failed to start" + ollama ps + ollama.log + exit 1 + + - name: Start Llama Stack server in background + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + source .venv/bin/activate + # TODO: use "llama stack run" + nohup uv run python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml > server.log 2>&1 & + + - name: Wait for Llama Stack server to be ready + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo " Llama Stack server is up!" + exit 0 + fi + sleep 1 + done + echo " Llama Stack server failed to start" + cat server.log + exit 1 + + - name: Run Inference Integration Tests + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + uv run pytest -v tests/integration/inference --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 From 28aade9a277b039024015914d7890b9c2cad1ce1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 20:09:04 +0100 Subject: [PATCH 18/38] ci: add GitHub Action to close stale issues and PRs (#1613) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? - Issues/PRs inactive for 60 days are marked as stale - Stale items are closed after 30 additional days of inactivity - Adds appropriate warning and closing messages - Sets daily schedule for stale checks Signed-off-by: Sébastien Han --- .github/workflows/stale_bot.yml | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/stale_bot.yml diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml new file mode 100644 index 000000000..2039fcbb4 --- /dev/null +++ b/.github/workflows/stale_bot.yml @@ -0,0 +1,45 @@ +name: Close stale issues and PRs + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + stale: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Stale Action + uses: actions/stale@v9 + with: + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-issue-message: > + This issue has been automatically closed due to inactivity. + Please feel free to reopen if you feel it is still relevant! + days-before-issue-stale: 60 + days-before-issue-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it! + days-before-pr-stale: 60 + days-before-pr-close: 30 + operations-per-run: 300 From a1bb7c8d82ff7575f9861bf1ab0f4933822b6fe9 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Mar 2025 15:47:58 -0400 Subject: [PATCH 19/38] docs: Add OpenAI, Anthropic, Gemini to API providers table (#1617) # What does this PR do? These are supported via https://github.com/meta-llama/llama-stack/pull/1267. 
cc @ashwinb Signed-off-by: Yuan Tang --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 6e1fd088e..aade9c15f 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,10 @@ Here is a list of the various API providers and available distributions that can | PG Vector | Single Node | | | ✅ | | | | PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | vLLM | Hosted and Single Node | | ✅ | | | | +| OpenAI | Hosted | | ✅ | | | | +| Anthropic | Hosted | | ✅ | | | | +| Gemini | Hosted | | ✅ | | | | + ### Distributions From ed841380dc34eb7974be1fd8edad821d407c53ab Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 13 Mar 2025 13:18:08 -0700 Subject: [PATCH 20/38] test: turn off recordable mock for now (#1616) Summary: will figure out how to do this best, turning it off for now. Test Plan: test_agents.py --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1616). * __->__ #1616 * #1615 --- tests/integration/fixtures/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py index e410039e7..1878c9e88 100644 --- a/tests/integration/fixtures/common.py +++ b/tests/integration/fixtures/common.py @@ -52,6 +52,8 @@ def llama_stack_client_with_mocked_inference(llama_stack_client, request): If --record-responses is passed, it will call the real APIs and record the responses. """ + # TODO: will rework this to be more stable + return llama_stack_client if not isinstance(llama_stack_client, LlamaStackAsLibraryClient): logging.warning( "llama_stack_client_with_mocked_inference is not supported for this client, returning original client without mocking" From a3d710e59cf469ac8eba136d98a6afd15db4e50e Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 13 Mar 2025 16:19:44 -0400 Subject: [PATCH 21/38] chore: Always check that git merge conflict markers are not present (#1610) # What does this PR do? Before the change, it was only doing it during the merge. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` $ git checkout d263edbf90f958349c7b9adea8fd4e5181932a69 $ pre-commit run --all-files check for merge conflicts................................................Failed - hook id: check-merge-conflict - exit code: 1 docs/_static/llama-stack-spec.yaml:3179: Merge conflict string '<<<<<<<' found docs/_static/llama-stack-spec.yaml:3185: Merge conflict string '=======' found docs/_static/llama-stack-spec.yaml:3190: Merge conflict string '>>>>>>>' found [...] 
``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 926ae21cc..80a303b09 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,6 +8,7 @@ repos: rev: v5.0.0 # Latest stable version hooks: - id: check-merge-conflict + args: ['--assume-in-merge'] - id: trailing-whitespace exclude: '\.py$' # Exclude Python files as Ruff already handles them - id: check-added-large-files From e101d15f12dba3a31f35762a4da66f8cecbc7579 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:40:15 -0400 Subject: [PATCH 22/38] build(deps): bump astral-sh/setup-uv from 4 to 5 (#1620) --- .github/workflows/integration-tests.yml | 2 +- .github/workflows/providers-build.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index af268e728..bae5188fa 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -14,7 +14,7 @@ jobs: uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: python-version: "3.10" diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 1bf6591dc..be4298a98 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -51,7 +51,7 @@ jobs: python-version: '3.10' - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: python-version: "3.10" From a062723d031fa715dd2f1e98ef299e1845ad0a41 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Thu, 13 Mar 2025 18:07:21 -0400 Subject: [PATCH 23/38] feat: add provider API for listing and inspecting provider info (#1429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? currently the `inspect` API for providers is really a `list` API. Create a new `providers` API which has a GET `providers/{provider_id}` inspect API which returns "user friendly" configuration to the end user. Also add a GET `/providers` endpoint which returns the list of providers as `inspect/providers` does today. This API follows CRUD and is more intuitive/RESTful. 
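To make the new surface concrete, here is a rough sketch of how the two endpoints could be exercised against a locally running stack; the `ollama` provider id and the default `8321` port are taken from the test plan below, so adjust both for your own setup:

```
# list every configured provider across all APIs
curl http://localhost:8321/v1/providers

# inspect the configuration of a single provider (sensitive fields are redacted server-side)
curl http://localhost:8321/v1/providers/ollama
```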
This work is part of the RFC at https://github.com/meta-llama/llama-stack/pull/1359 sensitive fields are redacted using `redact_sensetive_fields` on the server side before returning a response: Screenshot 2025-03-13 at 4 40 21 PM ## Test Plan using https://github.com/meta-llama/llama-stack-client-python/pull/181 a user is able to to run the following: `llama stack build --template ollama --image-type venv` `llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml` `llama-stack-client providers inspect ollama` Screenshot 2025-03-13 at 4 39 35 PM also, was able to run the new test_list integration test locally with ollama: Screenshot 2025-03-13 at 11 03 40 AM Signed-off-by: Charlie Doern --- docs/_static/llama-stack-spec.html | 76 ++++++++++++++++++- docs/_static/llama-stack-spec.yaml | 51 +++++++++++++ llama_stack/apis/datatypes.py | 1 + llama_stack/apis/inspect/inspect.py | 18 ++--- llama_stack/apis/providers/__init__.py | 7 ++ llama_stack/apis/providers/providers.py | 40 ++++++++++ llama_stack/distribution/configure.py | 2 +- llama_stack/distribution/distribution.py | 2 +- llama_stack/distribution/providers.py | 59 ++++++++++++++ llama_stack/distribution/resolver.py | 21 +++++ llama_stack/distribution/server/server.py | 1 + llama_stack/distribution/stack.py | 2 + pyproject.toml | 1 + tests/integration/providers/__init__.py | 5 ++ tests/integration/providers/test_providers.py | 17 +++++ 15 files changed, 291 insertions(+), 12 deletions(-) create mode 100644 llama_stack/apis/providers/__init__.py create mode 100644 llama_stack/apis/providers/providers.py create mode 100644 llama_stack/distribution/providers.py create mode 100644 tests/integration/providers/__init__.py create mode 100644 tests/integration/providers/test_providers.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 22fa781ac..e62f66bd6 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2642,7 +2642,81 @@ } } }, - "/v1/inspect/providers": { + "/v1/providers": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListProvidersResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [] + } + }, + "/v1/providers/{provider_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GetProviderResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [ + { + "name": "provider_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "/v1/inspect/providers": { "get": { "responses": { "200": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1f01351e9..cb31848ee 100644 --- a/docs/_static/llama-stack-spec.yaml +++ 
b/docs/_static/llama-stack-spec.yaml @@ -1782,6 +1782,57 @@ paths: schema: $ref: '#/components/schemas/RegisterModelRequest' required: true + /v1/providers: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListProvidersResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: [] + /v1/providers/{provider_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/GetProviderResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: + - name: provider_id + in: path + required: true + schema: + type: string /v1/inspect/providers: get: responses: diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index 842a2b63d..f644e5137 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -14,6 +14,7 @@ from llama_stack.schema_utils import json_schema_type @json_schema_type class Api(Enum): + providers = "providers" inference = "inference" safety = "safety" agents = "agents" diff --git a/llama_stack/apis/inspect/inspect.py b/llama_stack/apis/inspect/inspect.py index 4a647a2d9..25937bb61 100644 --- a/llama_stack/apis/inspect/inspect.py +++ b/llama_stack/apis/inspect/inspect.py @@ -11,13 +11,6 @@ from pydantic import BaseModel from llama_stack.schema_utils import json_schema_type, webmethod -@json_schema_type -class ProviderInfo(BaseModel): - api: str - provider_id: str - provider_type: str - - @json_schema_type class RouteInfo(BaseModel): route: str @@ -32,14 +25,21 @@ class HealthInfo(BaseModel): @json_schema_type -class VersionInfo(BaseModel): - version: str +class ProviderInfo(BaseModel): + api: str + provider_id: str + provider_type: str class ListProvidersResponse(BaseModel): data: List[ProviderInfo] +@json_schema_type +class VersionInfo(BaseModel): + version: str + + class ListRoutesResponse(BaseModel): data: List[RouteInfo] diff --git a/llama_stack/apis/providers/__init__.py b/llama_stack/apis/providers/__init__.py new file mode 100644 index 000000000..b554a5d23 --- /dev/null +++ b/llama_stack/apis/providers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .providers import * # noqa: F401 F403 diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py new file mode 100644 index 000000000..fd37bd500 --- /dev/null +++ b/llama_stack/apis/providers/providers.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import List, Protocol, runtime_checkable + +from pydantic import BaseModel + +from llama_stack.distribution.datatypes import Provider +from llama_stack.schema_utils import json_schema_type, webmethod + + +@json_schema_type +class ProviderInfo(BaseModel): + api: str + provider_id: str + provider_type: str + + +class GetProviderResponse(BaseModel): + data: Provider | None + + +class ListProvidersResponse(BaseModel): + data: List[ProviderInfo] + + +@runtime_checkable +class Providers(Protocol): + """ + Providers API for inspecting, listing, and modifying providers and their configurations. + """ + + @webmethod(route="/providers", method="GET") + async def list_providers(self) -> ListProvidersResponse: ... + + @webmethod(route="/providers/{provider_id}", method="GET") + async def inspect_provider(self, provider_id: str) -> GetProviderResponse: ... diff --git a/llama_stack/distribution/configure.py b/llama_stack/distribution/configure.py index 715bb5db4..2a3bf7053 100644 --- a/llama_stack/distribution/configure.py +++ b/llama_stack/distribution/configure.py @@ -62,7 +62,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec if config.apis: apis_to_serve = config.apis else: - apis_to_serve = [a.value for a in Api if a not in (Api.telemetry, Api.inspect)] + apis_to_serve = [a.value for a in Api if a not in (Api.telemetry, Api.inspect, Api.providers)] for api_str in apis_to_serve: api = Api(api_str) diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 308081415..ddb727663 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -56,7 +56,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: def providable_apis() -> List[Api]: routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()} - return [api for api in Api if api not in routing_table_apis and api != Api.inspect] + return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py new file mode 100644 index 000000000..219384900 --- /dev/null +++ b/llama_stack/distribution/providers.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from pydantic import BaseModel + +from llama_stack.apis.providers import GetProviderResponse, ListProvidersResponse, ProviderInfo, Providers + +from .datatypes import StackRunConfig +from .stack import redact_sensitive_fields + + +class ProviderImplConfig(BaseModel): + run_config: StackRunConfig + + +async def get_provider_impl(config, deps): + impl = ProviderImpl(config, deps) + await impl.initialize() + return impl + + +class ProviderImpl(Providers): + def __init__(self, config, deps): + self.config = config + self.deps = deps + + async def initialize(self) -> None: + pass + + async def list_providers(self) -> ListProvidersResponse: + run_config = self.config.run_config + ret = [] + for api, providers in run_config.providers.items(): + ret.extend( + [ + ProviderInfo( + api=api, + provider_id=p.provider_id, + provider_type=p.provider_type, + ) + for p in providers + ] + ) + + return ListProvidersResponse(data=ret) + + async def inspect_provider(self, provider_id: str) -> GetProviderResponse: + run_config = self.config.run_config + safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) + ret = None + for _, providers in safe_config.providers.items(): + for p in providers: + if p.provider_id == provider_id: + ret = p + + return GetProviderResponse(data=ret) diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index ab075f399..e9e406699 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -16,6 +16,7 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining +from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.safety import Safety from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions @@ -59,6 +60,7 @@ class InvalidProviderError(Exception): def api_protocol_map() -> Dict[Api, Any]: return { + Api.providers: ProvidersAPI, Api.agents: Agents, Api.inference: Inference, Api.inspect: Inspect, @@ -247,6 +249,25 @@ def sort_providers_by_deps( ) ) + sorted_providers.append( + ( + "providers", + ProviderWithSpec( + provider_id="__builtin__", + provider_type="__builtin__", + config={"run_config": run_config.model_dump()}, + spec=InlineProviderSpec( + api=Api.providers, + provider_type="__builtin__", + config_class="llama_stack.distribution.providers.ProviderImplConfig", + module="llama_stack.distribution.providers", + api_dependencies=apis, + deps__=[x.value for x in apis], + ), + ), + ) + ) + logger.debug(f"Resolved {len(sorted_providers)} providers") for api_str, provider in sorted_providers: logger.debug(f" {api_str} => {provider.provider_id}") diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 7ca009b13..8f9500ae9 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -368,6 +368,7 @@ def main(): apis_to_serve.add(inf.routing_table_api.value) apis_to_serve.add("inspect") + apis_to_serve.add("providers") for api_str in apis_to_serve: api = Api(api_str) diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2b974739a..9c9289a77 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -23,6 +23,7 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from 
llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining +from llama_stack.apis.providers import Providers from llama_stack.apis.safety import Safety from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions @@ -44,6 +45,7 @@ logger = get_logger(name=__name__, category="core") class LlamaStack( + Providers, VectorDBs, Inference, BatchInference, diff --git a/pyproject.toml b/pyproject.toml index 055fa7a55..aaea4f7c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -170,6 +170,7 @@ exclude = [ "^llama_stack/apis/inspect/inspect\\.py$", "^llama_stack/apis/models/models\\.py$", "^llama_stack/apis/post_training/post_training\\.py$", + "^llama_stack/apis/providers/providers\\.py$", "^llama_stack/apis/resource\\.py$", "^llama_stack/apis/safety/safety\\.py$", "^llama_stack/apis/scoring/scoring\\.py$", diff --git a/tests/integration/providers/__init__.py b/tests/integration/providers/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/integration/providers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/integration/providers/test_providers.py b/tests/integration/providers/test_providers.py new file mode 100644 index 000000000..174d01b5c --- /dev/null +++ b/tests/integration/providers/test_providers.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import pytest +from llama_stack_client import LlamaStackClient + +from llama_stack import LlamaStackAsLibraryClient + + +class TestProviders: + @pytest.mark.asyncio + def test_list(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): + provider_list = llama_stack_client.providers.list() + assert provider_list is not None From b906bad23803bf947573e6dcc6557b9cb95625ce Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Mar 2025 18:28:52 -0400 Subject: [PATCH 24/38] docs: Add OpenAI, Anthropic, Gemini to inference API providers table (#1622) # What does this PR do? Forgot to update this page as well as part of https://github.com/meta-llama/llama-stack/pull/1617. Signed-off-by: Yuan Tang --- docs/source/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/index.md b/docs/source/index.md index 0d0508466..af381f9c9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -61,6 +61,10 @@ A number of "adapters" are available for some popular Inference and Vector Store | Groq | Hosted | | SambaNova | Hosted | | PyTorch ExecuTorch | On-device iOS, Android | +| OpenAI | Hosted | +| Anthropic | Hosted | +| Gemini | Hosted | + **Vector IO API** | **Provider** | **Environments** | From 98b1b15e0fbff1cbf53496c2e28227a1511289c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 23:34:53 +0100 Subject: [PATCH 25/38] refactor: move all datetime.now() calls to UTC (#1589) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Updated all instances of datetime.now() to use timezone.utc for consistency in handling time across different systems. 
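As a small standalone illustration of the pattern (not taken from the diff), a timezone-aware timestamp carries an explicit UTC offset while a naive one does not:

```
# Illustrative sketch: naive vs. timezone-aware timestamps.
from datetime import datetime, timezone

naive = datetime.now()              # local wall-clock time, tzinfo is None
aware = datetime.now(timezone.utc)  # UTC, tzinfo is timezone.utc

print(naive.tzinfo)       # None
print(aware.tzinfo)       # UTC
print(aware.isoformat())  # e.g. 2025-03-13T22:34:53.123456+00:00 (offset is explicit)
```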
This ensures that timestamps are always in Coordinated Universal Time (UTC), avoiding issues with time zone discrepancies and promoting uniformity in time-related data. Signed-off-by: Sébastien Han --- llama_stack/cli/download.py | 4 ++-- .../llama3/prompt_templates/system_prompts.py | 4 +++- .../agents/meta_reference/agent_instance.py | 24 +++++++++---------- .../agents/meta_reference/persistence.py | 4 ++-- .../post_training/torchtune/post_training.py | 8 +++---- .../recipes/lora_finetuning_single_device.py | 4 ++-- .../meta_reference/console_span_processor.py | 8 +++---- .../meta_reference/sqlite_span_processor.py | 12 +++++----- .../code_interpreter/code_execution.py | 2 +- .../providers/utils/telemetry/tracing.py | 6 ++--- pyproject.toml | 21 +++++++++------- 11 files changed, 52 insertions(+), 45 deletions(-) diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index b43d50217..f1b722183 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -10,7 +10,7 @@ import json import os import shutil from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from functools import partial from pathlib import Path from typing import Dict, List, Optional @@ -404,7 +404,7 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int): d = json.load(f) manifest = Manifest(**d) - if datetime.now() > manifest.expires_on: + if datetime.now(timezone.utc) > manifest.expires_on: raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}") console = Console() diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py index 02e4814bc..b835d0ec0 100644 --- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py @@ -34,7 +34,9 @@ class SystemDefaultGenerator(PromptTemplateGeneratorBase): ) return PromptTemplate( template_str.lstrip("\n"), - {"today": datetime.now().strftime("%d %B %Y")}, + { + "today": datetime.now().strftime("%d %B %Y") # noqa: DTZ005 - we don't care about timezones here since we are displaying the date + }, ) def data_examples(self) -> List[Any]: diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 0ae1996cc..03692bcc7 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -11,7 +11,7 @@ import re import secrets import string import uuid -from datetime import datetime +from datetime import datetime, timezone from typing import AsyncGenerator, List, Optional, Union from urllib.parse import urlparse @@ -239,7 +239,7 @@ class ChatAgent(ShieldRunnerMixin): in_progress_tool_call_step = await self.storage.get_in_progress_tool_call_step( request.session_id, request.turn_id ) - now = datetime.now().astimezone().isoformat() + now = datetime.now(timezone.utc).isoformat() tool_execution_step = ToolExecutionStep( step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())), turn_id=request.turn_id, @@ -264,7 +264,7 @@ class ChatAgent(ShieldRunnerMixin): start_time = last_turn.started_at else: messages.extend(request.messages) - start_time = datetime.now().astimezone().isoformat() + start_time = datetime.now(timezone.utc).isoformat() input_messages = request.messages 
output_message = None @@ -295,7 +295,7 @@ class ChatAgent(ShieldRunnerMixin): input_messages=input_messages, output_message=output_message, started_at=start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), steps=steps, ) await self.storage.add_turn_to_session(request.session_id, turn) @@ -386,7 +386,7 @@ class ChatAgent(ShieldRunnerMixin): return step_id = str(uuid.uuid4()) - shield_call_start_time = datetime.now().astimezone().isoformat() + shield_call_start_time = datetime.now(timezone.utc).isoformat() try: yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -410,7 +410,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, violation=e.violation, started_at=shield_call_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -433,7 +433,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, violation=None, started_at=shield_call_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -472,7 +472,7 @@ class ChatAgent(ShieldRunnerMixin): client_tools[tool.name] = tool while True: step_id = str(uuid.uuid4()) - inference_start_time = datetime.now().astimezone().isoformat() + inference_start_time = datetime.now(timezone.utc).isoformat() yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( payload=AgentTurnResponseStepStartPayload( @@ -582,7 +582,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, model_response=copy.deepcopy(message), started_at=inference_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -653,7 +653,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, tool_calls=[tool_call], tool_responses=[], - started_at=datetime.now().astimezone().isoformat(), + started_at=datetime.now(timezone.utc).isoformat(), ), ) yield message @@ -670,7 +670,7 @@ class ChatAgent(ShieldRunnerMixin): "input": message.model_dump_json(), }, ) as span: - tool_execution_start_time = datetime.now().astimezone().isoformat() + tool_execution_start_time = datetime.now(timezone.utc).isoformat() tool_call = message.tool_calls[0] tool_result = await self.execute_tool_call_maybe( session_id, @@ -708,7 +708,7 @@ class ChatAgent(ShieldRunnerMixin): ) ], started_at=tool_execution_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 2c04305fd..e7d7d1828 100644 --- a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -7,7 +7,7 @@ import json import logging import uuid -from datetime import datetime +from datetime import datetime, timezone from typing import List, Optional from pydantic import BaseModel @@ -36,7 +36,7 @@ class AgentPersistence: session_info = AgentSessionInfo( session_id=session_id, session_name=name, - started_at=datetime.now(), + started_at=datetime.now(timezone.utc), ) await self.kvstore.set( key=f"session:{self.agent_id}:{session_id}", diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index 3a1affc91..2c129ef41 100644 --- 
a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict, Optional from llama_stack.apis.datasetio import DatasetIO @@ -64,7 +64,7 @@ class TorchtunePostTrainingImpl: job_status_response = PostTrainingJobStatusResponse( job_uuid=job_uuid, status=JobStatus.scheduled, - scheduled_at=datetime.now(), + scheduled_at=datetime.now(timezone.utc), ) self.jobs[job_uuid] = job_status_response @@ -84,7 +84,7 @@ class TorchtunePostTrainingImpl: ) job_status_response.status = JobStatus.in_progress - job_status_response.started_at = datetime.now() + job_status_response.started_at = datetime.now(timezone.utc) await recipe.setup() resources_allocated, checkpoints = await recipe.train() @@ -93,7 +93,7 @@ class TorchtunePostTrainingImpl: job_status_response.resources_allocated = resources_allocated job_status_response.checkpoints = checkpoints job_status_response.status = JobStatus.completed - job_status_response.completed_at = datetime.now() + job_status_response.completed_at = datetime.now(timezone.utc) except Exception: job_status_response.status = JobStatus.failed diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py index c88787f18..941c629e3 100644 --- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py @@ -8,7 +8,7 @@ import gc import logging import os import time -from datetime import datetime +from datetime import datetime, timezone from functools import partial from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -532,7 +532,7 @@ class LoraFinetuningSingleDevice: checkpoint_path = await self.save_checkpoint(epoch=curr_epoch) checkpoint = Checkpoint( identifier=f"{self.model_id}-sft-{curr_epoch}", - created_at=datetime.now(), + created_at=datetime.now(timezone.utc), epoch=curr_epoch, post_training_job_id=self.job_uuid, path=checkpoint_path, diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index 2e3bd4d3a..42b538876 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -5,7 +5,7 @@ # the root directory of this source tree. 
import json -from datetime import datetime +from datetime import datetime, timezone from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanProcessor @@ -34,7 +34,7 @@ class ConsoleSpanProcessor(SpanProcessor): if span.attributes and span.attributes.get("__autotraced__"): return - timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime("%H:%M:%S.%f")[:-3] + timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] print( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " @@ -46,7 +46,7 @@ class ConsoleSpanProcessor(SpanProcessor): if span.attributes and span.attributes.get("__autotraced__"): return - timestamp = datetime.utcfromtimestamp(span.end_time / 1e9).strftime("%H:%M:%S.%f")[:-3] + timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] span_context = ( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " @@ -74,7 +74,7 @@ class ConsoleSpanProcessor(SpanProcessor): print(f" {COLORS['dim']}{key}: {str_value}{COLORS['reset']}") for event in span.events: - event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime("%H:%M:%S.%f")[:-3] + event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] severity = event.attributes.get("severity", "info") message = event.attributes.get("message", event.name) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py index b39ee7716..5ed586fce 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py @@ -8,7 +8,7 @@ import json import os import sqlite3 import threading -from datetime import datetime +from datetime import datetime, timezone from opentelemetry.sdk.trace import SpanProcessor from opentelemetry.trace import Span @@ -124,8 +124,8 @@ class SQLiteSpanProcessor(SpanProcessor): trace_id, service_name, (span_id if not parent_span_id else None), - datetime.fromtimestamp(span.start_time / 1e9).isoformat(), - datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(), ), ) @@ -143,8 +143,8 @@ class SQLiteSpanProcessor(SpanProcessor): trace_id, parent_span_id, span.name, - datetime.fromtimestamp(span.start_time / 1e9).isoformat(), - datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(), json.dumps(dict(span.attributes)), span.status.status_code.name, span.kind.name, @@ -161,7 +161,7 @@ class SQLiteSpanProcessor(SpanProcessor): ( span_id, event.name, - datetime.fromtimestamp(event.timestamp / 1e9).isoformat(), + datetime.fromtimestamp(event.timestamp / 1e9, timezone.utc).isoformat(), json.dumps(dict(event.attributes)), ), ) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index d7b2dbdef..810591c1c 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -168,7 +168,7 @@ def 
process_matplotlib_response(response, matplotlib_dump_dir: str): image_paths = [] for i, img in enumerate(images): # create new directory for each day to better organize data: - dump_dname = datetime.today().strftime("%Y-%m-%d") + dump_dname = datetime.today().strftime("%Y-%m-%d") # noqa: DTZ002 - we don't care about timezones here since we are displaying the date dump_dpath = Path(matplotlib_dump_dir, dump_dname) dump_dpath.mkdir(parents=True, exist_ok=True) # save image into a file diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index bef229080..607d1a918 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -11,7 +11,7 @@ import logging import queue import threading import uuid -from datetime import datetime +from datetime import datetime, timezone from functools import wraps from typing import Any, Callable, Dict, List, Optional @@ -86,7 +86,7 @@ class TraceContext: span_id=generate_short_uuid(), trace_id=self.trace_id, name=name, - start_time=datetime.now(), + start_time=datetime.now(timezone.utc), parent_span_id=current_span.span_id if current_span else None, attributes=attributes, ) @@ -203,7 +203,7 @@ class TelemetryHandler(logging.Handler): UnstructuredLogEvent( trace_id=span.trace_id, span_id=span.span_id, - timestamp=datetime.now(), + timestamp=datetime.now(timezone.utc), message=self.format(record), severity=severity(record.levelname), ) diff --git a/pyproject.toml b/pyproject.toml index aaea4f7c4..ff7f46f77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,14 +124,15 @@ exclude = [ [tool.ruff.lint] select = [ - "B", # flake8-bugbear - "B9", # flake8-bugbear subset - "C", # comprehensions - "E", # pycodestyle - "F", # Pyflakes - "N", # Naming - "W", # Warnings - "I", # isort + "B", # flake8-bugbear + "B9", # flake8-bugbear subset + "C", # comprehensions + "E", # pycodestyle + "F", # Pyflakes + "N", # Naming + "W", # Warnings + "I", # isort + "DTZ", # datetime rules ] ignore = [ # The following ignores are desired by the project maintainers. @@ -145,6 +146,10 @@ ignore = [ "C901", # Complexity of the function is too high ] +# Ignore the following errors for the following files +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["DTZ"] # Ignore datetime rules for tests + [tool.mypy] mypy_path = ["llama_stack"] packages = ["llama_stack"] From c02464b6356006744e80c1e8fb96a4d22b1392ba Mon Sep 17 00:00:00 2001 From: Alina Ryan <66272285+alinaryan@users.noreply.github.com> Date: Thu, 13 Mar 2025 20:47:09 -0400 Subject: [PATCH 26/38] fix: Clarify `llama model prompt-format` help text (#1010) # What does this PR do? Updates the help text for the `llama model prompt-format` command to clarify that users should provide a specific model name (e.g., Llama3.1-8B, Llama3.2-11B-Vision), not a model family. Removes the default value and field for `--model-name` to prevent users from mistakenly thinking a model family name is acceptable. Adds guidance to run `llama model list` to view valid model names. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Output of `llama model prompt-format -h` Before: ``` (venv) alina@fedora:~/dev/llama/llama-stack$ llama model prompt-format -h usage: llama model prompt-format [-h] [-m MODEL_NAME] Show llama model message formats options: -h, --help show this help message and exit -m MODEL_NAME, --model-name MODEL_NAME Model Family (llama3_1, llama3_X, etc.) 
Example: llama model prompt-format (venv) alina@fedora:~/dev/llama/llama-stack$ llama model prompt-format --model-name llama3_1 usage: llama model prompt-format [-h] [-m MODEL_NAME] llama model prompt-format: error: llama3_1 is not a valid Model. Choose one from -- Llama3.1-8B Llama3.1-70B Llama3.1-405B Llama3.1-8B-Instruct Llama3.1-70B-Instruct Llama3.1-405B-Instruct Llama3.2-1B Llama3.2-3B Llama3.2-1B-Instruct Llama3.2-3B-Instruct Llama3.2-11B-Vision Llama3.2-90B-Vision Llama3.2-11B-Vision-Instruct Llama3.2-90B-Vision-Instruct ``` Output of `llama model prompt-format -h` After: ``` (venv) alina@fedora:~/dev/llama/llama-stack$ llama model prompt-format -h usage: llama model prompt-format [-h] [-m MODEL_NAME] Show llama model message formats options: -h, --help show this help message and exit -m MODEL_NAME, --model-name MODEL_NAME Example: Llama3.1-8B or Llama3.2-11B-Vision, etc (Run `llama model list` to see a list of valid model names) Example: llama model prompt-format ``` Signed-off-by: Alina Ryan --- llama_stack/cli/model/prompt_format.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 8058db461..0d62dcc11 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -41,8 +41,8 @@ class ModelPromptFormat(Subcommand): "-m", "--model-name", type=str, - default="llama3_1", - help="Model Family (llama3_1, llama3_X, etc.)", + help="Example: Llama3.1-8B or Llama3.2-11B-Vision, etc\n" + "(Run `llama model list` to see a list of valid model names)", ) self.parser.add_argument( "-l", @@ -81,10 +81,16 @@ class ModelPromptFormat(Subcommand): try: model_id = CoreModelId(args.model_name) except ValueError: - self.parser.error(f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}") + self.parser.error( + f"{args.model_name} is not a valid Model. Choose one from the list of valid models. " + f"Run `llama model list` to see the valid model names." + ) if model_id not in supported_model_ids: - self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}") + self.parser.error( + f"{model_id} is not a valid Model. Choose one from the list of valid models. " + f"Run `llama model list` to see the valid model names." + ) llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md" llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md" From ca0cbf4338957c347848a5274fec752bd4255d48 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Mar 2025 21:57:42 -0400 Subject: [PATCH 27/38] fix: Fix pre-commit check (#1628) # What does this PR do? 
Fixes pre-commit check failure after merging https://github.com/meta-llama/llama-stack/pull/1010: https://github.com/meta-llama/llama-stack/actions/runs/13847489719/job/38748770971 Signed-off-by: Yuan Tang --- llama_stack/cli/model/prompt_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 0d62dcc11..3ce77655b 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -60,7 +60,6 @@ class ModelPromptFormat(Subcommand): ] model_list = [m.value for m in supported_model_ids] - model_str = "\n".join(model_list) if args.list: headers = ["Model(s)"] From 9e73341008eb7cab053af5b58b26ed669d2c4bd1 Mon Sep 17 00:00:00 2001 From: Kai Wu Date: Thu, 13 Mar 2025 18:58:12 -0700 Subject: [PATCH 28/38] fix: change dog.jpg path in test_vision_inference.py (#1624) # What does this PR do? quick fix as the vision_inference test dog.jpg path has been changed. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) --- tests/integration/inference/test_vision_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/inference/test_vision_inference.py b/tests/integration/inference/test_vision_inference.py index 984e563d7..9f6fb0478 100644 --- a/tests/integration/inference/test_vision_inference.py +++ b/tests/integration/inference/test_vision_inference.py @@ -36,7 +36,7 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id "type": "image", "image": { "url": { - "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png" + "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png" }, }, }, @@ -65,7 +65,7 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id): "type": "image", "image": { "url": { - "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png" + "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png" }, }, }, From 33b096cc21e48910cf05f0c3e513032adb99fa84 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Mar 2025 19:56:32 -0700 Subject: [PATCH 29/38] fix: OpenAPI with provider get (#1627) # What does this PR do? - https://github.com/meta-llama/llama-stack/pull/1429 introduces GetProviderResponse in OpenAPI, which is not needed, and not correctly defined. 
cc @cdoern [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` llama-stack-client providers list ``` image [//]: # (## Documentation) --- docs/_static/llama-stack-spec.html | 189 ++++++++++++------------ docs/_static/llama-stack-spec.yaml | 124 ++++++++-------- llama_stack/apis/providers/providers.py | 10 +- llama_stack/distribution/providers.py | 22 +-- 4 files changed, 166 insertions(+), 179 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index e62f66bd6..b5e4097d9 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2151,6 +2151,48 @@ } } }, + "/v1/providers/{provider_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProviderInfo" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [ + { + "name": "provider_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/tool-runtime/invoke": { "post": { "responses": { @@ -2643,80 +2685,6 @@ } }, "/v1/providers": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListProvidersResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Providers" - ], - "description": "", - "parameters": [] - } - }, - "/v1/providers/{provider_id}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GetProviderResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Providers" - ], - "description": "", - "parameters": [ - { - "name": "provider_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "/v1/inspect/providers": { "get": { "responses": { "200": { @@ -7986,6 +7954,53 @@ ], "title": "InsertChunksRequest" }, + "ProviderInfo": { + "type": "object", + "properties": { + "api": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "provider_type": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "api", + "provider_id", + "provider_type", + "config" + ], + "title": "ProviderInfo" + }, "InvokeToolRequest": { "type": "object", "properties": { @@ -8198,27 +8213,6 @@ ], "title": "ListModelsResponse" }, - 
"ProviderInfo": { - "type": "object", - "properties": { - "api": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "provider_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "api", - "provider_id", - "provider_type" - ], - "title": "ProviderInfo" - }, "ListProvidersResponse": { "type": "object", "properties": { @@ -10219,6 +10213,10 @@ { "name": "PostTraining (Coming Soon)" }, + { + "name": "Providers", + "x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations." + }, { "name": "Safety" }, @@ -10265,6 +10263,7 @@ "Inspect", "Models", "PostTraining (Coming Soon)", + "Providers", "Safety", "Scoring", "ScoringFunctions", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index cb31848ee..bf2343ede 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1444,6 +1444,34 @@ paths: schema: $ref: '#/components/schemas/InsertChunksRequest' required: true + /v1/providers/{provider_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ProviderInfo' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: + - name: provider_id + in: path + required: true + schema: + type: string /v1/tool-runtime/invoke: post: responses: @@ -1783,57 +1811,6 @@ paths: $ref: '#/components/schemas/RegisterModelRequest' required: true /v1/providers: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListProvidersResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Providers - description: '' - parameters: [] - /v1/providers/{provider_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/GetProviderResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Providers - description: '' - parameters: - - name: provider_id - in: path - required: true - schema: - type: string - /v1/inspect/providers: get: responses: '200': @@ -5460,6 +5437,32 @@ components: - vector_db_id - chunks title: InsertChunksRequest + ProviderInfo: + type: object + properties: + api: + type: string + provider_id: + type: string + provider_type: + type: string + config: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - api + - provider_id + - provider_type + - config + title: ProviderInfo InvokeToolRequest: type: object properties: @@ -5595,21 +5598,6 @@ components: required: - data title: ListModelsResponse - ProviderInfo: - type: object - properties: - api: - type: string - provider_id: - type: string - provider_type: - type: string - additionalProperties: false - required: - - api 
- - provider_id - - provider_type - title: ProviderInfo ListProvidersResponse: type: object properties: @@ -6883,6 +6871,9 @@ tags: - name: Inspect - name: Models - name: PostTraining (Coming Soon) + - name: Providers + x-displayName: >- + Providers API for inspecting, listing, and modifying providers and their configurations. - name: Safety - name: Scoring - name: ScoringFunctions @@ -6907,6 +6898,7 @@ x-tagGroups: - Inspect - Models - PostTraining (Coming Soon) + - Providers - Safety - Scoring - ScoringFunctions diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py index fd37bd500..83d03d7c1 100644 --- a/llama_stack/apis/providers/providers.py +++ b/llama_stack/apis/providers/providers.py @@ -4,11 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import List, Protocol, runtime_checkable +from typing import Any, Dict, List, Protocol, runtime_checkable from pydantic import BaseModel -from llama_stack.distribution.datatypes import Provider from llama_stack.schema_utils import json_schema_type, webmethod @@ -17,10 +16,7 @@ class ProviderInfo(BaseModel): api: str provider_id: str provider_type: str - - -class GetProviderResponse(BaseModel): - data: Provider | None + config: Dict[str, Any] class ListProvidersResponse(BaseModel): @@ -37,4 +33,4 @@ class Providers(Protocol): async def list_providers(self) -> ListProvidersResponse: ... @webmethod(route="/providers/{provider_id}", method="GET") - async def inspect_provider(self, provider_id: str) -> GetProviderResponse: ... + async def inspect_provider(self, provider_id: str) -> ProviderInfo: ... diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py index 219384900..fb2476767 100644 --- a/llama_stack/distribution/providers.py +++ b/llama_stack/distribution/providers.py @@ -4,9 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+ from pydantic import BaseModel -from llama_stack.apis.providers import GetProviderResponse, ListProvidersResponse, ProviderInfo, Providers +from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers from .datatypes import StackRunConfig from .stack import redact_sensitive_fields @@ -32,14 +33,16 @@ class ProviderImpl(Providers): async def list_providers(self) -> ListProvidersResponse: run_config = self.config.run_config + safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) ret = [] - for api, providers in run_config.providers.items(): + for api, providers in safe_config.providers.items(): ret.extend( [ ProviderInfo( api=api, provider_id=p.provider_id, provider_type=p.provider_type, + config=p.config, ) for p in providers ] @@ -47,13 +50,10 @@ class ProviderImpl(Providers): return ListProvidersResponse(data=ret) - async def inspect_provider(self, provider_id: str) -> GetProviderResponse: - run_config = self.config.run_config - safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) - ret = None - for _, providers in safe_config.providers.items(): - for p in providers: - if p.provider_id == provider_id: - ret = p + async def inspect_provider(self, provider_id: str) -> ProviderInfo: + all_providers = await self.list_providers() + for p in all_providers.data: + if p.provider_id == provider_id: + return p - return GetProviderResponse(data=ret) + raise ValueError(f"Provider {provider_id} not found") From bfc79217a8837b6615209fd0e530b3fe004921ef Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 13 Mar 2025 23:25:15 -0400 Subject: [PATCH 30/38] chore: Add ./scripts/unit-tests.sh (#1515) # What does this PR do? Useful for local development. Now you can just trigger the script and not care about specific arguments to pass to run unit tests. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` $ . ./venv/bin/activate $ ./scripts/run_tests.sh $ echo $? 0 ``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka Co-authored-by: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 2 +- CONTRIBUTING.md | 16 ++++++++++++++++ scripts/unit-tests.sh | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100755 scripts/unit-tests.sh diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 517b5c39a..c7a30e9b8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -33,7 +33,7 @@ jobs: - name: Run unit tests run: | - uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} + PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} - name: Upload test results if: always() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 71e610064..1f188f259 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -108,6 +108,22 @@ uv run pre-commit run --all-files > [!CAUTION] > Before pushing your changes, make sure that the pre-commit hooks have passed successfully. 
+## Running unit tests + +You can run the unit tests by running: + +```bash +source .venv/bin/activate +./scripts/unit-tests.sh +``` + +If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows: + +``` +source .venv/bin/activate +PYTHON_VERSION=3.13 ./scripts/unit-tests.sh +``` + ## Adding a new dependency to the project To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run: diff --git a/scripts/unit-tests.sh b/scripts/unit-tests.sh new file mode 100755 index 000000000..dbc25e06b --- /dev/null +++ b/scripts/unit-tests.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +PYTHON_VERSION=${PYTHON_VERSION:-3.10} + +command -v uv >/dev/null 2>&1 || { echo >&2 "uv is required but it's not installed. Exiting."; exit 1; } + +uv python find $PYTHON_VERSION +FOUND_PYTHON=$? +if [ $FOUND_PYTHON -ne 0 ]; then + uv python install $PYTHON_VERSION +fi + +uv run --python $PYTHON_VERSION --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest -s -v tests/unit/ $@ From e3e7013ac8e64af382de2f6885f6094aedcffc11 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Fri, 14 Mar 2025 12:20:49 -0400 Subject: [PATCH 31/38] chore: Add pre-commit check to sync api spec docs (#1609) # What does this PR do? It will fail if the newly generated spec docs are different. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` $ pre-commit run --all-files check for merge conflicts................................................Passed trim trailing whitespace.................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed ruff.....................................................................Passed ruff-format..............................................................Passed blacken-docs.............................................................Passed uv-lock..................................................................Passed uv-export................................................................Passed mypy.....................................................................Passed Distribution Template Codegen............................................Passed API Spec Codegen.........................................................Passed ``` Now add a field to existing API. 
Repeat: ``` $ pre-commit run --all-files check for merge conflicts................................................Passed trim trailing whitespace.................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed ruff.....................................................................Passed ruff-format..............................................................Passed blacken-docs.............................................................Passed uv-lock..................................................................Passed uv-export................................................................Passed mypy.....................................................................Passed Distribution Template Codegen............................................Passed API Spec Codegen.........................................................Failed - hook id: openapi-codegen - files were modified by this hook ``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .pre-commit-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 80a303b09..072fa21e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,6 +83,17 @@ repos: require_serial: true files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$ +- repo: local + hooks: + - id: openapi-codegen + name: API Spec Codegen + additional_dependencies: + - uv==0.6.2 + entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null 2>&1' + language: python + pass_filenames: false + require_serial: true + ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate From 78d4872c0cdb39643b4109735b667df1568c8443 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Fri, 14 Mar 2025 15:36:25 -0400 Subject: [PATCH 32/38] feat: add support for logging config in the run.yaml (#1408) # What does this PR do? a user should be able to store a static logging configuration outside of their environment. This would make sense to store in the run yaml given that we store other things like server configuration in there. The environment variable settings override the config settings if both are available. The format in the config looks like this: ``` logging_config: category_levels: VALID_CATEGORY: VALID_STRING_LOG_LEVEL ``` any specified category out of the following: `core | server | router | inference | agents | safety | eval | tools | client` combined with any of the following log levels: `debug | info | warning | error | critical` can be placed in the category_levels list in order to achieve the desired log level ## Test Plan Test locally with a run config like the following: ``` version: '2' image_name: ollama logging_config: category_levels: server: debug apis: ... 
``` Signed-off-by: Charlie Doern --- llama_stack/distribution/datatypes.py | 10 +++ llama_stack/distribution/server/server.py | 36 ++++++---- llama_stack/log.py | 82 +++++++++++++++++------ 3 files changed, 93 insertions(+), 35 deletions(-) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index f62996081..7e1d8c016 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,14 @@ class Provider(BaseModel): config: Dict[str, Any] +class LoggingConfig(BaseModel): + category_levels: Dict[str, str] = Field( + default_factory=Dict, + description=""" + Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""", + ) + + class ServerConfig(BaseModel): port: int = Field( default=8321, @@ -176,6 +184,8 @@ a default SQLite store will be used.""", benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) + logging: Optional[LoggingConfig] = Field(default=None, description="Configuration for Llama Stack Logging") + server: ServerConfig = Field( default_factory=ServerConfig, description="Configuration for the HTTP(S) server", diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 8f9500ae9..b37b3a007 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -25,7 +25,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel, ValidationError from typing_extensions import Annotated -from llama_stack.distribution.datatypes import StackRunConfig +from llama_stack.distribution.datatypes import LoggingConfig, StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.request_headers import ( PROVIDER_DATA_VAR, @@ -306,34 +306,42 @@ def main(): args = parser.parse_args() - if args.env: - for env_pair in args.env: - try: - key, value = validate_env_pair(env_pair) - logger.info(f"Setting CLI environment variable {key} => {value}") - os.environ[key] = value - except ValueError as e: - logger.error(f"Error: {str(e)}") - sys.exit(1) - + log_line = "" if args.yaml_config: # if the user provided a config file, use it, even if template was specified config_file = Path(args.yaml_config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") - logger.info(f"Using config file: {config_file}") + log_line = f"Using config file: {config_file}" elif args.template: config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml" if not config_file.exists(): raise ValueError(f"Template {args.template} does not exist") - logger.info(f"Using template {args.template} config file: {config_file}") + log_line = f"Using template {args.template} config file: {config_file}" else: raise ValueError("Either --yaml-config or --template must be provided") + logger_config = None with open(config_file, "r") as fp: - config = replace_env_vars(yaml.safe_load(fp)) + config_contents = yaml.safe_load(fp) + if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")): + logger_config = LoggingConfig(**cfg) + logger = get_logger(name=__name__, category="server", config=logger_config) + if args.env: + for env_pair in args.env: + try: + key, value = validate_env_pair(env_pair) + logger.info(f"Setting CLI environment variable {key} => {value}") + 
os.environ[key] = value + except ValueError as e: + logger.error(f"Error: {str(e)}") + sys.exit(1) + config = replace_env_vars(config_contents) config = StackRunConfig(**config) + # now that the logger is initialized, print the line about which type of config we are using. + logger.info(log_line) + logger.info("Run configuration:") safe_config = redact_sensitive_fields(config.model_dump()) logger.info(yaml.dump(safe_config, indent=2)) diff --git a/llama_stack/log.py b/llama_stack/log.py index 572dea234..0ba95d547 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -7,13 +7,15 @@ import logging import os from logging.config import dictConfig -from typing import Dict +from typing import Dict, Optional from rich.console import Console from rich.errors import MarkupError from rich.logging import RichHandler from termcolor import cprint +from .distribution.datatypes import LoggingConfig + # Default log level DEFAULT_LOG_LEVEL = logging.INFO @@ -34,6 +36,56 @@ CATEGORIES = [ _category_levels: Dict[str, int] = {category: DEFAULT_LOG_LEVEL for category in CATEGORIES} +def config_to_category_levels(category: str, level: str): + """ + Helper function to be called either by environment parsing or yaml parsing to go from a list of categories and levels to a dictionary ready to be + used by the logger dictConfig. + + Parameters: + category (str): logging category to apply the level to + level (str): logging level to be used in the category + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. + """ + + category_levels: Dict[str, int] = {} + level_value = logging._nameToLevel.get(str(level).upper()) + if level_value is None: + logging.warning(f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'.") + return category_levels + + if category == "all": + # Apply the log level to all categories and the root logger + for cat in CATEGORIES: + category_levels[cat] = level_value + # Set the root logger's level to the specified level + category_levels["root"] = level_value + elif category in CATEGORIES: + category_levels[category] = level_value + logging.info(f"Setting '{category}' category to level '{level}'.") + else: + logging.warning(f"Unknown logging category: {category}. No changes made.") + return category_levels + + +def parse_yaml_config(yaml_config: LoggingConfig) -> Dict[str, int]: + """ + Helper function to parse a yaml logging configuration found in the run.yaml + + Parameters: + yaml_config (Logging): the logger config object found in the run.yaml + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. + """ + category_levels = {} + for category, level in yaml_config.category_levels.items(): + category_levels.update(config_to_category_levels(category=category, level=level)) + + return category_levels + + def parse_environment_config(env_config: str) -> Dict[str, int]: """ Parse the LLAMA_STACK_LOGGING environment variable and return a dictionary of category log levels. @@ -53,25 +105,7 @@ def parse_environment_config(env_config: str) -> Dict[str, int]: category, level = pair.split("=", 1) category = category.strip().lower() level = level.strip().upper() # Convert to uppercase for logging._nameToLevel - - level_value = logging._nameToLevel.get(level) - if level_value is None: - logging.warning( - f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'." 
- ) - continue - - if category == "all": - # Apply the log level to all categories and the root logger - for cat in CATEGORIES: - category_levels[cat] = level_value - # Set the root logger's level to the specified level - category_levels["root"] = level_value - elif category in CATEGORIES: - category_levels[category] = level_value - logging.info(f"Setting '{category}' category to level '{level}'.") - else: - logging.warning(f"Unknown logging category: {category}. No changes made.") + category_levels.update(config_to_category_levels(category=category, level=level)) except ValueError: logging.warning(f"Invalid logging configuration: '{pair}'. Expected format: 'category=level'.") @@ -176,7 +210,9 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None logger.setLevel(root_level) -def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdapter: +def get_logger( + name: str, category: str = "uncategorized", config: Optional[LoggingConfig] | None = None +) -> logging.LoggerAdapter: """ Returns a logger with the specified name and category. If no category is provided, defaults to 'uncategorized'. @@ -184,10 +220,14 @@ def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdap Parameters: name (str): The name of the logger (e.g., module or filename). category (str): The category of the logger (default 'uncategorized'). + config (Logging): optional yaml config to override the existing logger configuration Returns: logging.LoggerAdapter: Configured logger with category support. """ + if config: + _category_levels.update(parse_yaml_config(config)) + logger = logging.getLogger(name) logger.setLevel(_category_levels.get(category, DEFAULT_LOG_LEVEL)) return logging.LoggerAdapter(logger, {"category": category}) From a626b7bce37089913d528b7f343a49e0d7350051 Mon Sep 17 00:00:00 2001 From: yyymeta <123776235+yyymeta@users.noreply.github.com> Date: Fri, 14 Mar 2025 12:50:49 -0700 Subject: [PATCH 33/38] feat: [new open benchmark] BFCL_v3 (#1578) # What does this PR do? create a new dataset BFCL_v3 from https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html overall each question asks the model to perform a task described in natural language, and additionally a set of available functions and their schema are given for the model to choose from. the model is required to write the function call form including function name and parameters , to achieve the stated purpose. the results are validated against provided ground truth, to make sure that the generated function call and the ground truth function call are syntactically and semantically equivalent, by checking their AST . 
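As a rough illustration of that equivalence check, the sketch below parses a single generated call with Python's stdlib `ast` module and compares it to a ground-truth call (the `get_weather` call is made up; the actual checker in this PR additionally handles Java/JavaScript, nested types, and per-parameter possible-answer lists):

```
import ast


def call_to_dict(source: str) -> dict:
    """Parse a call such as get_weather(city="Paris", unit="C") into
    {"get_weather": {"city": "Paris", "unit": "C"}}."""
    node = ast.parse(source, mode="eval").body
    assert isinstance(node, ast.Call)
    name = ast.unparse(node.func)
    kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
    return {name: kwargs}


generated = call_to_dict('get_weather(city="Paris", unit="C")')
ground_truth = call_to_dict('get_weather(unit="C", city="Paris")')
assert generated == ground_truth  # same call; keyword order does not matter
```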
## Test Plan start server by ``` llama stack run ./llama_stack/templates/ollama/run.yaml ``` then send traffic ``` llama-stack-client eval run-benchmark "bfcl" --model-id meta-llama/Llama-3.2-3B-Instruct --output-dir /tmp/gpqa --num-examples 2 ``` [//]: # (## Documentation) --- distributions/dependencies.json | 19 + .../inline/eval/meta_reference/eval.py | 7 +- .../providers/inline/scoring/basic/scoring.py | 9 +- .../basic/scoring_fn/bfcl_scoring_fn.py | 93 ++ .../scoring/basic/scoring_fn/fn_defs/bfcl.py | 21 + .../scoring/basic/utils/bfcl/__init__.py | 5 + .../scoring/basic/utils/bfcl/ast_parser.py | 296 ++++++ .../scoring/basic/utils/bfcl/checker.py | 989 ++++++++++++++++++ .../scoring/basic/utils/bfcl/tree_sitter.py | 40 + llama_stack/providers/registry/eval.py | 2 +- .../utils/common/data_schema_validator.py | 22 + .../open-benchmark/open_benchmark.py | 21 + llama_stack/templates/open-benchmark/run.yaml | 23 + requirements.txt | 2 +- uv.lock | 6 +- 15 files changed, 1546 insertions(+), 9 deletions(-) create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py diff --git a/distributions/dependencies.json b/distributions/dependencies.json index c3f039247..d2ed12d3a 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -30,6 +30,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "cerebras": [ @@ -62,6 +63,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -97,6 +99,7 @@ "sqlite-vec", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -132,6 +135,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -168,6 +172,7 @@ "sqlite-vec", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -203,6 +208,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -236,6 +242,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "hf-endpoint": [ @@ -270,6 +277,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "hf-serverless": [ @@ -304,6 +312,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -344,6 +353,7 @@ "torchvision", "tqdm", "transformers", + "tree_sitter", "uvicorn", "zmq" ], @@ -385,6 +395,7 @@ "torchvision", "tqdm", "transformers", + "tree_sitter", "uvicorn", "zmq" ], @@ -417,6 +428,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "ollama": [ @@ -451,6 +463,7 @@ "sentencepiece", "tqdm", 
"transformers", + "tree_sitter", "uvicorn" ], "open-benchmark": [ @@ -485,6 +498,7 @@ "together", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "passthrough": [ @@ -517,6 +531,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -551,6 +566,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -616,6 +632,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -651,6 +668,7 @@ "together", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -685,6 +703,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "vllm", "sentence-transformers --no-deps", diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index a1bebaa4c..85b351262 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -12,7 +12,7 @@ from llama_stack.apis.agents import Agents, StepType from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.inference import Inference, UserMessage +from llama_stack.apis.inference import Inference, SystemMessage, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import BenchmarksProtocolPrivate @@ -118,7 +118,7 @@ class MetaReferenceEvalImpl( for i, x in tqdm(enumerate(input_rows)): assert ColumnName.chat_completion_input.value in x, "Invalid input row" input_messages = json.loads(x[ColumnName.chat_completion_input.value]) - input_messages = [UserMessage(**x) for x in input_messages] + input_messages = [UserMessage(**x) for x in input_messages if x["role"] == "user"] # NOTE: only single-turn agent generation is supported. 
Create a new session for each input row session_create_response = await self.agents_api.create_agent_session(agent_id, f"session-{i}") @@ -168,10 +168,11 @@ class MetaReferenceEvalImpl( generations.append({ColumnName.generated_answer.value: response.completion_message.content}) elif ColumnName.chat_completion_input.value in x: chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value]) - input_messages = [UserMessage(**x) for x in chat_completion_input_json] + input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"] messages = [] if candidate.system_message: messages.append(candidate.system_message) + messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"] messages += input_messages response = await self.inference_api.chat_completion( model_id=candidate.model, diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index 00945b99d..599f5f98c 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -22,12 +22,19 @@ from llama_stack.providers.utils.common.data_schema_validator import ( ) from .config import BasicScoringConfig +from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn from .scoring_fn.equality_scoring_fn import EqualityScoringFn from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn -FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn] +FIXED_FNS = [ + EqualityScoringFn, + SubsetOfScoringFn, + RegexParserScoringFn, + RegexParserMathResponseScoringFn, + BFCLScoringFn, +] class BasicScoringImpl( diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py new file mode 100644 index 000000000..f37780f3e --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import re +from typing import Any, Dict, Optional + +from llama_stack.apis.scoring import ScoringResultRow +from llama_stack.apis.scoring_functions import ScoringFnParams +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn + +from ..utils.bfcl.ast_parser import decode_ast +from ..utils.bfcl.checker import ast_checker, is_empty_output +from .fn_defs.bfcl import bfcl + + +def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]: + contain_func_call = False + error = None + error_type = None + checker_result = {} + try: + prediction = decode_ast(x["generated_answer"], x["language"]) or "" + contain_func_call = True + # if not is_function_calling_format_output(prediction): + if is_empty_output(prediction): + contain_func_call = False + error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
+ error_type = "ast_decoder:decoder_wrong_output_format" + else: + checker_result = ast_checker( + json.loads(x["function"]), + prediction, + json.loads(x["ground_truth"]), + x["language"], + test_category=test_category, + model_name="", + ) + except Exception as e: + prediction = "" + error = f"Invalid syntax. Failed to decode AST. {str(e)}" + error_type = "ast_decoder:decoder_failed" + return { + "prediction": prediction, + "contain_func_call": contain_func_call, + "valid": checker_result.get("valid", False), + "error": error or checker_result.get("error", ""), + "error_type": error_type or checker_result.get("error_type", ""), + } + + +def gen_valid(x: Dict[str, Any]) -> Dict[str, float]: + return {"valid": x["valid"]} + + +def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]: + # This function serves for both relevance and irrelevance tests, which share the exact opposite logic. + # If `test_category` is "irrelevance", the model is expected to output no function call. + # No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`). + # If `test_category` is "relevance", the model is expected to output to a function call, and empty list doesn't count as a function call. + acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"] + return {"valid": float(acc)} + + +class BFCLScoringFn(RegisteredBaseScoringFn): + """ + A scoring_fn for BFCL + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.supported_fn_defs_registry = { + bfcl.identifier: bfcl, + } + + async def score_row( + self, + input_row: Dict[str, Any], + scoring_fn_identifier: Optional[str] = "bfcl", + scoring_params: Optional[ScoringFnParams] = None, + ) -> ScoringResultRow: + test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"]) + score_result = postprocess(input_row, test_category) + if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}: + score = gen_relevance_acc(score_result)["valid"] + else: + score = gen_valid(score_result)["valid"] + return { + "score": float(score), + } diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py new file mode 100644 index 000000000..392d92c86 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +bfcl = ScoringFn( + identifier="basic::bfcl", + description="BFCL complex scoring", + return_type=NumberType(), + provider_id="basic", + provider_resource_id="bfcl", + params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]), +) diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py new file mode 100644 index 000000000..445cdfc77 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py @@ -0,0 +1,296 @@ +# ruff: noqa +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import ast + +from .tree_sitter import get_parser + + +def parse_java_function_call(source_code): + if not source_code.endswith(";"): + source_code += ";" # Necessary for the parser not to register an error + parser = get_parser("java") + tree = parser.parse(bytes(source_code, "utf8")) + root_node = tree.root_node + + if root_node.has_error: + raise Exception("Error parsing java the source code.") + + def get_text(node): + """Returns the text represented by the node.""" + return source_code[node.start_byte : node.end_byte] + + def traverse_node(node, nested=False): + if node.type == "string_literal": + if nested: + return get_text(node) + # Strip surrounding quotes from string literals + return get_text(node)[1:-1] + elif node.type == "character_literal": + if nested: + return get_text(node) + # Strip surrounding single quotes from character literals + return get_text(node)[1:-1] + """Traverse the node to collect texts for complex structures.""" + if node.type in [ + "identifier", + "class_literal", + "type_identifier", + "method_invocation", + ]: + return get_text(node) + elif node.type == "array_creation_expression": + # Handle array creation expression specifically + type_node = node.child_by_field_name("type") + value_node = node.child_by_field_name("value") + type_text = traverse_node(type_node, True) + value_text = traverse_node(value_node, True) + return f"new {type_text}[]{value_text}" + elif node.type == "object_creation_expression": + # Handle object creation expression specifically + type_node = node.child_by_field_name("type") + arguments_node = node.child_by_field_name("arguments") + type_text = traverse_node(type_node, True) + if arguments_node: + # Process each argument carefully, avoiding unnecessary punctuation + argument_texts = [] + for child in arguments_node.children: + if child.type not in [ + ",", + "(", + ")", + ]: # Exclude commas and parentheses + argument_text = traverse_node(child, True) + argument_texts.append(argument_text) + arguments_text = ", ".join(argument_texts) + return f"new {type_text}({arguments_text})" + else: + return f"new {type_text}()" + elif node.type == "set": + # Handling sets specifically + items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]] + return "{" + ", ".join(items) + "}" + + elif node.child_count > 0: + return "".join(traverse_node(child, True) for child in node.children) + else: + return get_text(node) + + def extract_arguments(args_node): + arguments = {} + for child in args_node.children: + if child.type == "assignment_expression": + # For named parameters + name_node, value_node = child.children[0], child.children[2] + name = get_text(name_node) + value = traverse_node(value_node) + if name in arguments: + if not isinstance(arguments[name], list): + arguments[name] = [arguments[name]] + arguments[name].append(value) + else: + arguments[name] = value + # 
arguments.append({'name': name, 'value': value}) + elif child.type in ["identifier", "class_literal", "set"]: + # For unnamed parameters and handling sets + value = traverse_node(child) + if None in arguments: + if not isinstance(arguments[None], list): + arguments[None] = [arguments[None]] + arguments[None].append(value) + else: + arguments[None] = value + return arguments + + def traverse(node): + if node.type == "method_invocation": + # Extract the function name and its arguments + method_name = get_text(node.child_by_field_name("name")) + class_name_node = node.child_by_field_name("object") + if class_name_node: + class_name = get_text(class_name_node) + function_name = f"{class_name}.{method_name}" + else: + function_name = method_name + arguments_node = node.child_by_field_name("arguments") + if arguments_node: + arguments = extract_arguments(arguments_node) + for key, value in arguments.items(): + if isinstance(value, list): + raise Exception("Error: Multiple arguments with the same name are not supported.") + return [{function_name: arguments}] + + else: + for child in node.children: + result = traverse(child) + if result: + return result + + result = traverse(root_node) + return result if result else {} + + +def parse_javascript_function_call(source_code): + if not source_code.endswith(";"): + source_code += ";" # Necessary for the parser not to register an error + parser = get_parser("javascript") + # Parse the source code + tree = parser.parse(bytes(source_code, "utf8")) + root_node = tree.root_node + if root_node.has_error: + raise Exception("Error js parsing the source code.") + + # Function to recursively extract argument details + def extract_arguments(node): + args = {} + for child in node.children: + if child.type == "assignment_expression": + # Extract left (name) and right (value) parts of the assignment + name = child.children[0].text.decode("utf-8") + value = child.children[2].text.decode("utf-8") + if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")): + value = value[1:-1] # Trim the quotation marks + if name in args: + if not isinstance(args[name], list): + args[name] = [args[name]] + args[name].append(value) + else: + args[name] = value + + elif child.type == "identifier" or child.type == "true": + # Handle non-named arguments and boolean values + value = child.text.decode("utf-8") + if None in args: + if not isinstance(args[None], list): + args[None] = [args[None]] + args[None].append(value) + else: + args[None] = value + return args + + # Find the function call and extract its name and arguments + if root_node.type == "program": + for child in root_node.children: + if child.type == "expression_statement": + for sub_child in child.children: + if sub_child.type == "call_expression": + function_name = sub_child.children[0].text.decode("utf8") + arguments_node = sub_child.children[1] + parameters = extract_arguments(arguments_node) + for key, value in parameters.items(): + if isinstance(value, list): + raise Exception("Error: Multiple arguments with the same name are not supported.") + result = [{function_name: parameters}] + return result + + +def ast_parse(input_str, language="Python"): + if language == "Python": + cleaned_input = input_str.strip("[]'") + parsed = ast.parse(cleaned_input, mode="eval") + extracted = [] + if isinstance(parsed.body, ast.Call): + extracted.append(resolve_ast_call(parsed.body)) + else: + for elem in parsed.body.elts: + extracted.append(resolve_ast_call(elem)) + return extracted + elif 
language == "Java": + return parse_java_function_call(input_str[1:-1]) # Remove the [ and ] from the string + elif language == "JavaScript": + return parse_javascript_function_call(input_str[1:-1]) + else: + raise NotImplementedError(f"Unsupported language: {language}") + + +def resolve_ast_call(elem): + # Handle nested attributes for deeply nested module paths + func_parts = [] + func_part = elem.func + while isinstance(func_part, ast.Attribute): + func_parts.append(func_part.attr) + func_part = func_part.value + if isinstance(func_part, ast.Name): + func_parts.append(func_part.id) + func_name = ".".join(reversed(func_parts)) + args_dict = {} + # Parse when args are simply passed as an unnamed dictionary arg + for arg in elem.args: + if isinstance(arg, ast.Dict): + for key, value in zip(arg.keys, arg.values): + if isinstance(key, ast.Constant): + arg_name = key.value + output = resolve_ast_by_type(value) + args_dict[arg_name] = output + for arg in elem.keywords: + output = resolve_ast_by_type(arg.value) + args_dict[arg.arg] = output + return {func_name: args_dict} + + +def resolve_ast_by_type(value): + if isinstance(value, ast.Constant): + if value.value is Ellipsis: + output = "..." + else: + output = value.value + elif isinstance(value, ast.UnaryOp): + output = -value.operand.value + elif isinstance(value, ast.List): + output = [resolve_ast_by_type(v) for v in value.elts] + elif isinstance(value, ast.Dict): + output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)} + elif isinstance(value, ast.NameConstant): # Added this condition to handle boolean values + output = value.value + elif isinstance(value, ast.BinOp): # Added this condition to handle function calls as arguments + output = eval(ast.unparse(value)) + elif isinstance(value, ast.Name): + output = value.id + elif isinstance(value, ast.Call): + if len(value.keywords) == 0: + output = ast.unparse(value) + else: + output = resolve_ast_call(value) + elif isinstance(value, ast.Tuple): + output = tuple(resolve_ast_by_type(v) for v in value.elts) + elif isinstance(value, ast.Lambda): + output = eval(ast.unparse(value.body[0].value)) + elif isinstance(value, ast.Ellipsis): + output = "..." 
+ elif isinstance(value, ast.Subscript): + try: + output = ast.unparse(value.body[0].value) + except: + output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]" + else: + raise Exception(f"Unsupported AST type: {type(value)}") + return output + + +def decode_ast(result, language="Python"): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decoded_output = ast_parse(func, language) + return decoded_output + + +def decode_execute(result): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = ast_parse(func) + execution_list = [] + for function_call in decode_output: + for key, value in function_call.items(): + execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})") + return execution_list diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py new file mode 100644 index 000000000..f6aab123c --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py @@ -0,0 +1,989 @@ +# ruff: noqa +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import json +import re +import time +from typing import Any + +# Comment out for now until we actually use the rest checker in evals +# import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. + + +class NoAPIKeyError(Exception): + def __init__(self): + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + super().__init__(self.message) + + +REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 + + +JAVA_TYPE_CONVERSION = { + "byte": int, + "short": int, + "integer": int, + "float": float, + "double": float, + "long": int, + "boolean": bool, + "char": str, + "Array": list, + "ArrayList": list, + "Set": set, + "HashMap": dict, + "Hashtable": dict, + "Queue": list, # this can be `queue.Queue` as well, for simplicity we check with list + "Stack": list, + "String": str, + "any": str, +} + +JS_TYPE_CONVERSION = { + "String": str, + "integer": int, + "float": float, + "Bigint": int, + "Boolean": bool, + "dict": dict, + "array": list, + "any": str, +} + +# We switch to conditional import for the following two imports to avoid unnecessary installations. +# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. 
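+# They are imported lazily inside simple_function_checker, only when a Java or JavaScript test is checked.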
+# from js_type_converter import js_type_converter +# from java_type_converter import java_type_converter + +PYTHON_TYPE_MAPPING = { + "string": str, + "integer": int, + "float": float, + "boolean": bool, + "array": list, + "tuple": list, + "dict": dict, + "any": str, +} + +# This is the list of types that we need to recursively check its values +PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"] + + +NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"] + + +#### Helper functions for AST #### +def find_description(func_descriptions, name): + if type(func_descriptions) == list: + for func_description in func_descriptions: + if func_description["name"] == name: + return func_description + return None + else: + # it is a dict, there is only one function + return func_descriptions + + +def get_possible_answer_type(possible_answer: list): + for answer in possible_answer: + if answer != "": # Optional parameter + return type(answer) + return None + + +def type_checker( + param: str, + value, + possible_answer: list, + expected_type_description: str, + expected_type_converted, + nested_type_converted, +): + # NOTE: This type checker only supports nested type checking for one level deep. + # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex. + + result: Any = { + "valid": True, + "error": [], + "is_variable": False, + "error_type": "type_error:simple", + } + + is_variable = False + # check for the case where a variable is used instead of a actual value. + # use the type in possible_answer as the expected type + possible_answer_type = get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. + # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if possible_answer_type != expected_type_converted: + is_variable = True + + # value is the same type as in function description + if type(value) == expected_type_converted: + # We don't need to do recursive check for simple types + if nested_type_converted == None: + result["is_variable"] = is_variable + return result + else: + for possible_answer_item in possible_answer: + flag = True # Each parameter should match to at least one possible answer type. + # Here, we assume that each item should be the same type. We could also relax it. + if type(possible_answer_item) == list: + for value_item in value: + checker_result = type_checker( + param, + value_item, + possible_answer_item, + str(nested_type_converted), + nested_type_converted, + None, + ) + if not checker_result["valid"]: + flag = False + break + + if flag: + return {"valid": True, "error": [], "is_variable": is_variable} + + result["valid"] = False + result["error"] = [ + f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}." + ] + result["error_type"] = "type_error:nested" + + # value is not as expected, check for the case where a variable is used instead of a actual value + # use the type in possible_answer as the expected type + possible_answer_type = get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. 
+ # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if type(value) == possible_answer_type: + result["is_variable"] = True + return result + + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:simple" + return result + + +def standardize_string(input_string: str): + # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase + # It will also convert all the single quotes to double quotes + # This is used to compare the model output with the possible answers + # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 + regex_string = r"[ \,\.\/\-\_\*\^]" + return re.sub(regex_string, "", input_string).lower().replace("'", '"') + + +def string_checker(param: str, model_output: str, possible_answer: list): + standardize_possible_answer = [] + standardize_model_output = standardize_string(model_output) + for i in range(len(possible_answer)): + if type(possible_answer[i]) == str: + standardize_possible_answer.append(standardize_string(possible_answer[i])) + + if standardize_model_output not in standardize_possible_answer: + return { + "valid": False, + "error": [ + f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive." + ], + "error_type": "value_error:string", + } + + return {"valid": True, "error": []} + + +def list_checker(param: str, model_output: list, possible_answer: list): + # Convert the tuple to a list + + standardize_model_output = list(model_output) + + # If the element in the list is a string, we need to standardize it + for i in range(len(standardize_model_output)): + if type(standardize_model_output[i]) == str: + standardize_model_output[i] = standardize_string(model_output[i]) + + standardize_possible_answer: Any = [] + # We also need to standardize the possible answers + for i in range(len(possible_answer)): + standardize_possible_answer.append([]) + for j in range(len(possible_answer[i])): + if type(possible_answer[i][j]) == str: + standardize_possible_answer[i].append(standardize_string(possible_answer[i][j])) + else: + standardize_possible_answer[i].append(possible_answer[i][j]) + + if standardize_model_output not in standardize_possible_answer: + return { + "valid": False, + "error": [ + f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}." + ], + "error_type": "value_error:list/tuple", + } + + return {"valid": True, "error": []} + + +def dict_checker(param: str, model_output: dict, possible_answers: list): + # This function works for simple dictionaries, but not dictionaries with nested dictionaries. + # The current dataset only contains simple dictionaries, so this is sufficient. 
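+    # Strategy: try each candidate answer dict in turn; a match requires every key the model produced
+    # to be an allowed key with an acceptable (string-standardized) value, and every non-optional
+    # ground-truth key to be present in the model output.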
+ + result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} + for i in range(len(possible_answers)): + if possible_answers[i] == "": + continue + + result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} + + flag = True + + possible_answer = possible_answers[i] + # possible_anwer is a single dictionary + + for key, value in model_output.items(): + if key not in possible_answer: + result["valid"] = False + result["error"].append(f"Unexpected dict key parameter: '{key}'.") # type: ignore[attr-defined] + result["error_type"] = "value_error:dict_key" + flag = False + break + + standardize_value = value + # If the value is a string, we need to standardize it + if type(value) == str: + standardize_value = standardize_string(value) + + # We also need to standardize the possible answers if they are string + standardize_possible_answer = [] + for i in range(len(possible_answer[key])): + if type(possible_answer[key][i]) == str: + standardize_possible_answer.append(standardize_string(possible_answer[key][i])) + else: + standardize_possible_answer.append(possible_answer[key][i]) + + if standardize_value not in standardize_possible_answer: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." + ) + result["error_type"] = "value_error:dict_value" + flag = False + break + + for key, value in possible_answer.items(): + if key not in model_output and "" not in value: + result["valid"] = False + result["error"].append(f"Missing dict key parameter: '{key}'.") # type: ignore[attr-defined] + result["error_type"] = "value_error:dict_key" + flag = False + break + + if flag: + return {"valid": True, "error": []} + + return result + + +def list_dict_checker(param: str, model_output: list, possible_answers: list): + # This function takes in a list of dictionaries and checks if each dictionary is valid + # The order of the dictionaries in the list must match the order of the possible answers + + result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"} + + for answer_index in range(len(possible_answers)): + flag = True # True means so far, all dictionaries are valid + + # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers + if len(model_output) != len(possible_answers[answer_index]): + result["valid"] = False + result["error"] = ["Wrong number of dictionaries in the list."] + result["error_type"] = "value_error:list_dict_count" + flag = False + continue + + for dict_index in range(len(model_output)): + result = dict_checker( + param, + model_output[dict_index], + [possible_answers[answer_index][dict_index]], + ) + if not result["valid"]: + flag = False + break + if flag: + return {"valid": True, "error": []} + + return result + + +def simple_function_checker( + func_description: dict, + model_output: dict, + possible_answer: dict, + language: str, + model_name: str, +): + possible_answer = list(possible_answer.values())[0] + # Extract function name and parameters details + func_name = func_description["name"] + param_details = func_description["parameters"]["properties"] + required_params = func_description["parameters"]["required"] + + # Initialize a result dictionary + result = { + "valid": True, + "error": [], + "error_type": "simple_function_checker:unclear", + } + + # Check if function name matches + if func_name not in model_output: + result["valid"] 
= False + result["error"].append( # type: ignore[attr-defined] + f"Function name {repr(func_name)} not found in model output." + ) + result["error_type"] = "simple_function_checker:wrong_func_name" + return result + + model_params = model_output[func_name] + + # Check for required parameters in model output + for param in required_params: + if param not in model_params: + result["valid"] = False + result["error"].append(f"Missing required parameter: {repr(param)}.") # type: ignore[attr-defined] + result["error_type"] = "simple_function_checker:missing_required" + return result + + # Validate types and values for each parameter in model output + for param, value in model_params.items(): + if param not in param_details or param not in possible_answer: + result["valid"] = False + result["error"].append(f"Unexpected parameter: {repr(param)}.") # type: ignore[attr-defined] + result["error_type"] = "simple_function_checker:unexpected_param" + return result + + full_param_details = param_details[param] + expected_type_description = full_param_details["type"] # This is a string + is_variable = False + nested_type_converted = None + + if language == "Java": + from evals.utils.bfcl.java_type_converter import java_type_converter + + expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] + + if expected_type_description in JAVA_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:java" + return result + + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] + value = java_type_converter(value, expected_type_description, nested_type) + else: + value = java_type_converter(value, expected_type_description) + + elif language == "JavaScript": + from evals.utils.bfcl.js_type_converter import js_type_converter + + expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] + + if expected_type_description in JS_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:js" + return result + + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = JS_TYPE_CONVERSION[nested_type] + value = js_type_converter(value, expected_type_description, nested_type) + else: + value = js_type_converter(value, expected_type_description) + + elif language == "Python": + expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description] + if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = PYTHON_TYPE_MAPPING[nested_type] + + # We convert all tuple value to list when the expected type is tuple. + # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). + # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. 
+ if expected_type_description == "tuple" and type(value) == tuple: + value = list(value) + + # Allow python auto conversion from int to float + if language == "Python" and expected_type_description == "float" and type(value) == int: + value = float(value) + + # Type checking + # In fact, we only check for Python here. + # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. + type_check_result = type_checker( + param, + value, + possible_answer[param], + expected_type_description, + expected_type_converted, + nested_type_converted, + ) + is_variable = type_check_result["is_variable"] + if not type_check_result["valid"]: + return type_check_result + + # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. + # We can just treat the variable as a string and use the normal flow. + if not is_variable: + # Special handle for dictionaries + if expected_type_converted == dict: + result = dict_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Special handle for list of dictionaries + elif expected_type_converted == list and nested_type_converted == dict: + result = list_dict_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Special handle for strings + elif expected_type_converted == str: + # We don't check for case sensitivity for string, as long as it's not a variable + result = string_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + elif expected_type_converted == list: + result = list_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Check if the value is within the possible answers + if value not in possible_answer[param]: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}." + ) + result["error_type"] = "value_error:others" + return result + + # Check for optional parameters not provided but allowed + for param in possible_answer: + if param not in model_params and "" not in possible_answer[param]: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Optional parameter {repr(param)} not provided and not marked as optional." 
+ ) + result["error_type"] = "simple_function_checker:missing_optional" + return result + + return result + + +def parallel_function_checker_enforce_order( + func_descriptions: list, + model_output: list, + possible_answers: dict, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "parallel_function_checker_enforce_order:wrong_count", + } + + func_name_list = list(possible_answers.keys()) + possible_answers_list = [] + + for key, value in possible_answers.items(): + possible_answers_list.append({key: value}) + + for i in range(len(possible_answers_list)): + func_description = find_description(func_descriptions, func_name_list[i]) + + result = simple_function_checker( + func_description, + model_output[i], + possible_answers_list[i], + language, + model_name, + ) + if not result["valid"]: + return result + + return {"valid": True, "error": []} + + +def parallel_function_checker_no_order( + func_descriptions: list, + model_output: list, + possible_answers: list, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "parallel_function_checker_no_order:wrong_count", + } + + matched_indices = [] + + # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer + # It must be this way because we need ground truth to fetch the correct function description + for i in range(len(possible_answers)): + # possible_answers[i] is a dictionary with only one key + func_name_expected = list(possible_answers[i].keys())[0] + func_description = find_description(func_descriptions, func_name_expected) + + all_errors = [] + + for index in range(len(model_output)): + if index in matched_indices: + continue + + result = simple_function_checker( + func_description, + model_output[index], + possible_answers[i], + language, + model_name, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_output_item": model_output[index], + "possible_answer_item": possible_answers[i], + } + } + ) + + if not result["valid"]: + considered_indices = [i for i in range(len(model_output)) if i not in matched_indices] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type] + ) + return { + "valid": False, + "error": all_errors, + "error_type": "parallel_function_checker_no_order:cannot_find_match", + } + + return {"valid": True, "error": []} + + +def multiple_function_checker( + func_descriptions: list, + model_output: list, + possible_answers: list, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "multiple_function_checker:wrong_count", + } + + # possible_answers is a list of only one dictionary with only one key + func_name_expected = list(possible_answers[0].keys())[0] + func_description = find_description(func_descriptions, func_name_expected) + return simple_function_checker( + func_description, + model_output[0], + possible_answers[0], + language, + model_name, + ) + + +def patten_matcher(exec_output, expected_result, 
function_call, is_sanity_check): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + if type(exec_output) != type(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type", + "model_executed_output": exec_output, + } + if type(exec_output) == dict: + # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. + # This happens when the key is a timestamp or a random number. + if is_sanity_check: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:dict_length", + "model_executed_output": exec_output, + } + else: + return result + + for key, value in expected_result.items(): + if key not in exec_output: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_key_not_found", + "model_executed_output": exec_output, + } + for key, value in exec_output.items(): + if key not in expected_result: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_extra_key", + "model_executed_output": exec_output, + } + if type(exec_output) == list: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:list_length", + "model_executed_output": exec_output, + } + return result + + +#### Helper functions for Exec #### +def executable_checker_simple( + function_call: str, + expected_result, + expected_result_type: str, + is_sanity_check=False, +): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + exec_dict: Any = {} + + try: + exec( + "from executable_python_function import *" + "\nresult=" + function_call, + exec_dict, + ) + exec_output = exec_dict["result"] + except NoAPIKeyError as e: + raise e + except Exception as e: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Error in execution: {repr(function_call)}. Error: {str(e)}" + ) + result["error_type"] = "executable_checker:execution_error" + return result + + # We need to special handle the case where the execution result is a tuple and convert it to a list + # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json + if isinstance(exec_output, tuple): + exec_output = list(exec_output) + + if expected_result_type == "exact_match": + if exec_output != expected_result: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. 
Expected: {expected_result}, but got: {exec_output}." + ) + result["error_type"] = "executable_checker:wrong_result" + result["model_executed_output"] = exec_output + return result + + elif expected_result_type == "real_time_match": + # Allow for 5% difference + if (type(expected_result) == float or type(expected_result) == int) and ( + type(exec_output) == float or type(exec_output) == int + ): + if not ( + expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + <= exec_output + <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + ): + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + else: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + + else: + # structural match + pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check) + if not pattern_match_result["valid"]: + return pattern_match_result + + return result + + +def executable_checker_parallel_no_order( + decoded_result: list, expected_exec_result: list, expected_exec_result_type: list +): + if len(decoded_result) != len(expected_exec_result): + return { + "valid": False, + "error": [ + f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." + ], + "error_type": "value_error:exec_result_count", + } + + matched_indices = [] + for i in range(len(expected_exec_result)): + all_errors = [] + for index in range(len(decoded_result)): + if index in matched_indices: + continue + + result = executable_checker_simple( + decoded_result[index], + expected_exec_result[i], + expected_exec_result_type[i], + False, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_executed_output": ( + result["model_executed_output"] if "model_executed_output" in result else None + ), + } + } + ) + + if not result["valid"]: + considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type] + ) + return { + "valid": False, + "error": all_errors, + "error_type": "executable_checker:cannot_find_match", + } + + return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + +#### Main function #### +def executable_checker_rest(func_call, idx): + # Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used. 
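+    # The ground-truth JSONL below is loaded fresh on every call and indexed by `idx`.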
+ EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution + with open(EVAL_GROUND_TRUTH_PATH, "r") as f: + EVAL_GROUND_TRUTH = f.readlines() + if "https://geocode.maps.co" in func_call: + time.sleep(2) + if "requests_get" in func_call: + func_call = func_call.replace("requests_get", "requests.get") + try: + response = eval(func_call) + except Exception as e: + return { + "valid": False, + "error": [f"Execution failed. {str(e)}"], + "error_type": "executable_checker_rest:execution_error", + } + + try: + if response.status_code == 200: + eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx]) + try: + if isinstance(eval_GT_json, dict): + if isinstance(response.json(), dict): + if set(eval_GT_json.keys()) == set(response.json().keys()): + return {"valid": True, "error": [], "error_type": ""} + return { + "valid": False, + "error": ["Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + return { + "valid": False, + "error": [f"Expected dictionary, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + + elif isinstance(eval_GT_json, list): + if isinstance(response.json(), list): + if len(eval_GT_json) != len(response.json()): + return { + "valid": False, + "error": [f"Response list length inconsistency."], + "error_type": "value_error:exec_result_rest_count", + } + + else: + for i in range(len(eval_GT_json)): + if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()): + return { + "valid": False, + "error": [f"Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + + return {"valid": True, "error": []} + else: + return { + "valid": False, + "error": [f"Expected list, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + return { + "valid": False, + "error": [f"Expected dict or list, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + except Exception as e: + return { + "valid": False, + "error": [ + f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" + ], + "error_type": "executable_checker_rest:response_format_error", + } + else: + return { + "valid": False, + "error": [f"Execution result status code is not 200, got {response.status_code}"], + "error_type": "executable_checker_rest:wrong_status_code", + } + except Exception as e: + return { + "valid": False, + "error": [f"Cannot get status code of the response. 
Error: {str(e)}"],
+            "error_type": "executable_checker_rest:cannot_get_status_code",
+        }
+
+
+def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
+    if "parallel" in test_category:
+        return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)
+
+    elif "multiple" in test_category:
+        return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)
+
+    else:
+        if len(model_output) != 1:
+            return {
+                "valid": False,
+                "error": ["Wrong number of functions."],
+                "error_type": "simple_function_checker:wrong_count",
+            }
+
+        return simple_function_checker(
+            func_description[0],
+            model_output[0],
+            possible_answer[0],
+            language,
+            model_name,
+        )
+
+
+def exec_checker(decoded_result: list, func_description: dict, test_category: str):
+    if "multiple" in test_category or "parallel" in test_category:
+        return executable_checker_parallel_no_order(
+            decoded_result,
+            func_description["execution_result"],
+            func_description["execution_result_type"],
+        )
+
+    else:
+        if len(decoded_result) != 1:
+            return {
+                "valid": False,
+                "error": ["Wrong number of functions."],
+                "error_type": "simple_exec_checker:wrong_count",
+            }
+        return executable_checker_simple(
+            decoded_result[0],
+            func_description["execution_result"][0],
+            func_description["execution_result_type"][0],
+            False,
+        )
+
+
+def is_empty_output(decoded_output):
+    # This function is a patch to the ast decoder for relevance detection
+    # Sometimes the ast decoder will parse successfully, but the input doesn't really have a function call
+    # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct)
+    if not is_function_calling_format_output(decoded_output):
+        return True
+    if len(decoded_output) == 0:
+        return True
+    if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
+        return True
+
+
+def is_function_calling_format_output(decoded_output):
+    # Ensure the output is a list of dictionaries
+    if type(decoded_output) == list:
+        for item in decoded_output:
+            if type(item) != dict:
+                return False
+        return True
+    return False
diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py
new file mode 100644
index 000000000..ed97ee360
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Tree-sitter changes its API with unfortunate frequency. Modules that need it should
+import it from here so that we can centrally manage things as necessary.
+"""
+
+# These currently work with tree-sitter 0.23.0
+# NOTE: Don't import tree-sitter or any of the language modules in the main module
+# because not all environments have them. Import lazily inside functions where needed.
+
+import importlib
+import typing
+
+if typing.TYPE_CHECKING:
+    import tree_sitter
+
+
+def get_language(language: str) -> "tree_sitter.Language":
+    import tree_sitter
+
+    language_module_name = f"tree_sitter_{language}"
+    try:
+        language_module = importlib.import_module(language_module_name)
+    except ModuleNotFoundError as exc:
+        raise ValueError(
+            f"Language {language} is not found. 
Please install the tree-sitter-{language} package." + ) from exc + return tree_sitter.Language(language_module.language()) + + +def get_parser(language: str, **kwargs) -> "tree_sitter.Parser": + import tree_sitter + + lang = get_language(language) + return tree_sitter.Parser(lang, **kwargs) diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py index 6901c3741..755d30382 100644 --- a/llama_stack/providers/registry/eval.py +++ b/llama_stack/providers/registry/eval.py @@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.eval, provider_type="inline::meta-reference", - pip_packages=[], + pip_packages=["tree_sitter"], module="llama_stack.providers.inline.eval.meta_reference", config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig", api_dependencies=[ diff --git a/llama_stack/providers/utils/common/data_schema_validator.py b/llama_stack/providers/utils/common/data_schema_validator.py index 3d14c4148..eb9d9dd60 100644 --- a/llama_stack/providers/utils/common/data_schema_validator.py +++ b/llama_stack/providers/utils/common/data_schema_validator.py @@ -23,6 +23,10 @@ class ColumnName(Enum): generated_answer = "generated_answer" context = "context" dialog = "dialog" + function = "function" + language = "language" + id = "id" + ground_truth = "ground_truth" VALID_SCHEMAS_FOR_SCORING = [ @@ -37,6 +41,15 @@ VALID_SCHEMAS_FOR_SCORING = [ ColumnName.generated_answer.value: StringType(), ColumnName.context.value: StringType(), }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.function.value: StringType(), + ColumnName.language.value: StringType(), + ColumnName.id.value: StringType(), + ColumnName.ground_truth.value: StringType(), + }, ] VALID_SCHEMAS_FOR_EVAL = [ @@ -50,6 +63,15 @@ VALID_SCHEMAS_FOR_EVAL = [ ColumnName.expected_answer.value: StringType(), ColumnName.completion_input.value: CompletionInputType(), }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.function.value: StringType(), + ColumnName.language.value: StringType(), + ColumnName.id.value: StringType(), + ColumnName.ground_truth.value: StringType(), + }, ] diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index 2b40797f9..17f5b8ee7 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -226,6 +226,22 @@ def get_distribution_template() -> DistributionTemplate: "chat_completion_input": {"type": "string"}, }, ), + DatasetInput( + dataset_id="bfcl", + provider_id="huggingface", + url=URL(uri="https://huggingface.co/datasets/llamastack/bfcl_v3"), + metadata={ + "path": "llamastack/bfcl_v3", + "split": "train", + }, + dataset_schema={ + "function": {"type": "string"}, + "language": {"type": "string"}, + "ground_truth": {"type": "string"}, + "id": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), ] default_benchmarks = [ @@ -249,6 +265,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="math_500", scoring_functions=["basic::regex_parser_math_response"], ), + BenchmarkInput( + benchmark_id="meta-reference-bfcl", + dataset_id="bfcl", + scoring_functions=["basic::bfcl"], + ), ] return DistributionTemplate( 
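+        # The "bfcl" dataset and "meta-reference-bfcl" benchmark appended above are
+        # shipped with this template alongside the existing open benchmarks.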
name=name, diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 5ef25435b..6961f8022 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -216,6 +216,24 @@ datasets: split: test dataset_id: math_500 provider_id: huggingface +- dataset_schema: + function: + type: string + language: + type: string + ground_truth: + type: string + id: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/bfcl_v3 + metadata: + path: llamastack/bfcl_v3 + split: train + dataset_id: bfcl + provider_id: huggingface scoring_fns: [] benchmarks: - dataset_id: simpleqa @@ -238,6 +256,11 @@ benchmarks: - basic::regex_parser_math_response metadata: {} benchmark_id: meta-reference-math-500 +- dataset_id: bfcl + scoring_functions: + - basic::bfcl + metadata: {} + benchmark_id: meta-reference-bfcl tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/requirements.txt b/requirements.txt index ae8a0af9f..3c382ad84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ distro==1.9.0 exceptiongroup==1.2.2 ; python_full_version < '3.11' filelock==3.17.0 fire==0.7.0 -fsspec==2025.2.0 +fsspec==2024.12.0 h11==0.14.0 httpcore==1.0.7 httpx==0.28.1 diff --git a/uv.lock b/uv.lock index 9ec3680f8..207f5981f 100644 --- a/uv.lock +++ b/uv.lock @@ -769,11 +769,11 @@ wheels = [ [[package]] name = "fsspec" -version = "2025.2.0" +version = "2024.12.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b5/79/68612ed99700e6413de42895aa725463e821a6b3be75c87fcce1b4af4c70/fsspec-2025.2.0.tar.gz", hash = "sha256:1c24b16eaa0a1798afa0337aa0db9b256718ab2a89c425371f5628d22c3b6afd", size = 292283 } +sdist = { url = "https://files.pythonhosted.org/packages/ee/11/de70dee31455c546fbc88301971ec03c328f3d1138cfba14263f651e9551/fsspec-2024.12.0.tar.gz", hash = "sha256:670700c977ed2fb51e0d9f9253177ed20cbde4a3e5c0283cc5385b5870c8533f", size = 291600 } wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, + { url = "https://files.pythonhosted.org/packages/de/86/5486b0188d08aa643e127774a99bac51ffa6cf343e3deb0583956dca5b22/fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2", size = 183862 }, ] [package.optional-dependencies] From c5857a9b50253ac2d2dbe0d270a3ea75fc0bf6f6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 14 Mar 2025 14:45:37 -0700 Subject: [PATCH 34/38] fix: sleep between tests oof --- tests/integration/conftest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index bf1092c4a..22290b519 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -8,6 +8,7 @@ import itertools import os import platform import textwrap +import time from dotenv import load_dotenv @@ -18,6 +19,12 @@ from .report import Report logger = get_logger(__name__, category="tests") +def pytest_runtest_teardown(item): + interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS") + if interval_seconds: + time.sleep(float(interval_seconds)) + + def pytest_configure(config): config.option.tbstyle = "short" 
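+    # Short tracebacks and a suppressed warnings summary keep test output compact.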
config.option.disable_warnings = True From 93cfade8c9929a9cb5c7082d1824c96550af20e3 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 14 Mar 2025 15:21:26 -0700 Subject: [PATCH 35/38] ci: Bump version to 0.1.7 --- pyproject.toml | 4 ++-- requirements.txt | 2 +- uv.lock | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ff7f46f77..4a5befbd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.6" +version = "0.1.7" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -27,7 +27,7 @@ dependencies = [ "huggingface-hub", "jinja2>=3.1.6", "jsonschema", - "llama-stack-client>=0.1.6", + "llama-stack-client>=0.1.7", "prompt-toolkit", "python-dotenv", "pydantic>=2", diff --git a/requirements.txt b/requirements.txt index 3c382ad84..cb75e2c43 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ idna==3.10 jinja2==3.1.6 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -llama-stack-client==0.1.6 +llama-stack-client==0.1.7 lxml==5.3.1 markdown-it-py==3.0.0 markupsafe==3.0.2 diff --git a/uv.lock b/uv.lock index 207f5981f..860b29241 100644 --- a/uv.lock +++ b/uv.lock @@ -1176,7 +1176,7 @@ wheels = [ [[package]] name = "llama-stack" -version = "0.1.6" +version = "0.1.7" source = { editable = "." } dependencies = [ { name = "blobfile" }, @@ -1270,7 +1270,7 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, - { name = "llama-stack-client", specifier = ">=0.1.6" }, + { name = "llama-stack-client", specifier = ">=0.1.7" }, { name = "mcp", marker = "extra == 'test'" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, @@ -1318,7 +1318,7 @@ provides-extras = ["dev", "unit", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" -version = "0.1.6" +version = "0.1.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1335,9 +1335,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b5/48/70ffdc7ab655234794e9559de9b1776b39610c09aaee8d3bc74bfbd570b4/llama_stack_client-0.1.6.tar.gz", hash = "sha256:92c6c55c3281839e690df7bfc289c36a5dde0f491574bbdb6b8b665dc3d5a16c", size = 264874 } +sdist = { url = "https://files.pythonhosted.org/packages/12/82/cd3ac4cddfeb2c63dc25b2469ded66e39101a01586bb2535bfae4293dc49/llama_stack_client-0.1.7.tar.gz", hash = "sha256:8ed51d73a62848a72e1c862d8881e64785be1a9118bcea94aa910025571b34d0", size = 242071 } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/51/1102914f819cf4412a5c9fd3f7dcc28175608e5f01ee164885972c3ec30b/llama_stack_client-0.1.6-py3-none-any.whl", hash = "sha256:708e20630d4e97a1cb03a19b933f4da6748cc857fe170998c392cf0f30f0f4c7", size = 373941 }, + { url = "https://files.pythonhosted.org/packages/dc/ee/055282258899f25f61d248a335e6bcdceb76e3f6ed76ad3a28240ebfa0fe/llama_stack_client-0.1.7-py3-none-any.whl", hash = "sha256:db89a21811310f9249a951a25f96298affa3ca1d3610a518093c94e984105292", size = 273882 }, ] [[package]] From 7b81761a56a2adb16327c5a9aea76d89a332b79d Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 14 Mar 2025 15:46:45 -0700 Subject: [PATCH 36/38] fix: update CDN url for stoplight --- 
docs/_static/llama-stack-spec.html | 4 ++-- docs/openapi_generator/pyopenapi/template.html | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b5e4097d9..4f1b09140 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6,8 +6,8 @@ OpenAPI specification - - + +
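For context, here is a minimal sketch of how the centralized `tree_sitter` helper introduced earlier in this series might be exercised. It is a sketch only: it assumes the optional `tree-sitter` and `tree-sitter-python` packages are installed (the meta-reference eval provider now lists `tree_sitter` in its pip_packages), and the parsed call string is an invented example rather than a real dataset row.

```python
# Usage sketch for the new bfcl/tree_sitter helpers.
# Assumptions: tree_sitter and tree_sitter_python are installed; the parsed
# string is a made-up function call, not actual BFCL data.
from llama_stack.providers.inline.scoring.basic.utils.bfcl.tree_sitter import get_parser

parser = get_parser("python")
tree = parser.parse(b"get_weather(city='Paris', unit='celsius')")
root = tree.root_node

print(root.type)              # "module"
print(root.children[0].type)  # "expression_statement" wrapping the call
```

Keeping every tree-sitter import behind this one module is a deliberate choice: the bindings change their API frequently and are not installed in every environment, so only code paths that actually parse source (such as the BFCL AST checks) pay for the dependency.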