From 3d71e5a03695715e12852de43ce787ef3420863b Mon Sep 17 00:00:00 2001
From: ehhuang
Date: Thu, 6 Mar 2025 14:46:29 -0800
Subject: [PATCH] test: recordable mocks use json only (#1443)

# Summary:
removes the use of pickle

# Test Plan:
Run the following with `--record-responses` first, then another time without.

LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct
---
 tests/integration/fixtures/common.py          |     2 +-
 tests/integration/fixtures/recordable_mock.py |   172 +-
 .../recorded_responses/chat_completion.json   | 49050 ++++++++--------
 .../recorded_responses/chat_completion.pickle | Bin 888589 -> 0 bytes
 .../recorded_responses/invoke_tool.json       |  1003 +-
 .../recorded_responses/invoke_tool.pickle     | Bin 67524 -> 0 bytes
 6 files changed, 23792 insertions(+), 26435 deletions(-)
 delete mode 100644 tests/integration/fixtures/recorded_responses/chat_completion.pickle
 delete mode 100644 tests/integration/fixtures/recorded_responses/invoke_tool.pickle

diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py
index a30f85076..6a75b3adf 100644
--- a/tests/integration/fixtures/common.py
+++ b/tests/integration/fixtures/common.py
@@ -59,7 +59,7 @@ def llama_stack_client_with_mocked_inference(llama_stack_client, request):
         return llama_stack_client

     record_responses = request.config.getoption("--record-responses")
-    cache_dir = Path(__file__).parent / "fixtures" / "recorded_responses"
+    cache_dir = Path(__file__).parent / "recorded_responses"

     # Create a shallow copy of the client to avoid modifying the original
     client = copy.copy(llama_stack_client)
diff --git a/tests/integration/fixtures/recordable_mock.py b/tests/integration/fixtures/recordable_mock.py
index d8704a0d5..d71426336 100644
--- a/tests/integration/fixtures/recordable_mock.py
+++ b/tests/integration/fixtures/recordable_mock.py
@@ -3,10 +3,12 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import importlib
 import json
 import os
-import pickle
 import re
+from datetime import datetime
+from enum import Enum
 from pathlib import Path


@@ -15,18 +17,18 @@ class RecordableMock:

     def __init__(self, real_func, cache_dir, func_name, record=False):
         self.real_func = real_func
-        self.pickle_path = Path(cache_dir) / f"{func_name}.pickle"
         self.json_path = Path(cache_dir) / f"{func_name}.json"
         self.record = record
         self.cache = {}

         # Load existing cache if available and not recording
-        if self.pickle_path.exists():
+        if self.json_path.exists():
             try:
-                with open(self.pickle_path, "rb") as f:
-                    self.cache = pickle.load(f)
+                with open(self.json_path, "r") as f:
+                    self.cache = json.load(f)
             except Exception as e:
-                print(f"Error loading cache from {self.pickle_path}: {e}")
+                print(f"Error loading cache from {self.json_path}: {e}")
+                raise

     async def __call__(self, *args, **kwargs):
         """
@@ -98,23 +100,19 @@ class RecordableMock:
         # Check if it's a value or chunks
         if cached_data.get("type") == "value":
             # It's a regular value
-            return cached_data["value"]
+            return self._reconstruct_object(cached_data["value"])
         else:
             # It's chunks from an async generator
             async def replay_generator():
                 for chunk in cached_data["chunks"]:
-                    yield chunk
+                    yield self._reconstruct_object(chunk)

             return replay_generator()

     def _create_cache_key(self, args, kwargs):
         """Create a hashable key from the function arguments, ignoring auto-generated IDs."""
-        # Convert args and kwargs to a string representation directly
-        args_str = str(args)
-        kwargs_str = str(sorted([(k, kwargs[k]) for k in kwargs]))
-
-        # Combine into a single key
-        key = f"{args_str}_{kwargs_str}"
+        # Convert to JSON strings with sorted keys
+        key = json.dumps((args, kwargs), sort_keys=True, default=self._json_default)

         # Post-process the key with regex to replace IDs with placeholders
         # Replace UUIDs and similar patterns
@@ -126,83 +124,95 @@ class RecordableMock:
         return key

     def _save_cache(self):
-        """Save the cache to disk in both pickle and JSON formats."""
-        os.makedirs(self.pickle_path.parent, exist_ok=True)
+        """Save the cache to disk in JSON format."""
+        os.makedirs(self.json_path.parent, exist_ok=True)

-        # Save as pickle for exact object preservation
-        with open(self.pickle_path, "wb") as f:
-            pickle.dump(self.cache, f)
-
-        # Also save as JSON for human readability and diffing
+        # Write the JSON file with pretty formatting
         try:
-            # Create a simplified version of the cache for JSON
-            json_cache = {}
-            for key, value in self.cache.items():
-                if value.get("type") == "generator":
-                    # For generators, create a simplified representation of each chunk
-                    chunks = []
-                    for chunk in value["chunks"]:
-                        chunk_dict = self._object_to_json_safe_dict(chunk)
-                        chunks.append(chunk_dict)
-                    json_cache[key] = {"type": "generator", "chunks": chunks}
-                else:
-                    # For values, create a simplified representation
-                    val = value["value"]
-                    val_dict = self._object_to_json_safe_dict(val)
-                    json_cache[key] = {"type": "value", "value": val_dict}
-
-            # Write the JSON file with pretty formatting
             with open(self.json_path, "w") as f:
-                json.dump(json_cache, f, indent=2, sort_keys=True)
+                json.dump(self.cache, f, indent=2, sort_keys=True, default=self._json_default)
+                # write another empty line at the end of the file to make pre-commit happy
+                f.write("\n")
         except Exception as e:
             print(f"Error saving JSON cache: {e}")

-    def _object_to_json_safe_dict(self, obj):
-        """Convert an object to a JSON-safe dictionary."""
-        # Handle enum types
-        if hasattr(obj, "value") and hasattr(obj.__class__, "__members__"):
-            return {"__enum__": obj.__class__.__name__, "value": obj.value}
+    def _json_default(self, obj):
+        """Default function for JSON serialization of objects."""
+
+        if isinstance(obj, datetime):
+            return {
+                "__datetime__": obj.isoformat(),
+                "__module__": obj.__class__.__module__,
+                "__class__": obj.__class__.__name__,
+            }
+
+        if isinstance(obj, Enum):
+            return {
+                "__enum__": obj.__class__.__name__,
+                "value": obj.value,
+                "__module__": obj.__class__.__module__,
+            }

         # Handle Pydantic models
         if hasattr(obj, "model_dump"):
-            return self._process_dict(obj.model_dump())
-        elif hasattr(obj, "dict"):
-            return self._process_dict(obj.dict())
+            model_data = obj.model_dump()
+            return {
+                "__pydantic__": obj.__class__.__name__,
+                "__module__": obj.__class__.__module__,
+                "data": model_data,
+            }

-        # Handle regular objects with __dict__
-        try:
-            return self._process_dict(vars(obj))
-        except Exception as e:
-            print(f"Error converting object to JSON-safe dict: {e}")
-            # If we can't get a dict, convert to string
-            return str(obj)
+    def _reconstruct_object(self, data):
+        """Reconstruct an object from its JSON representation."""
+        if isinstance(data, dict):
+            # Check if this is a serialized datetime
+            if "__datetime__" in data:
+                try:
+                    module_name = data.get("__module__", "datetime")
+                    class_name = data.get("__class__", "datetime")

-    def _process_dict(self, d):
-        """Process a dictionary to make all values JSON-safe."""
-        if not isinstance(d, dict):
-            return d
+                    # Try to import the specific datetime class
+                    module = importlib.import_module(module_name)
+                    dt_class = getattr(module, class_name)

-        result = {}
-        for k, v in d.items():
-            if isinstance(v, dict):
-                result[k] = self._process_dict(v)
-            elif isinstance(v, list):
-                result[k] = [
-                    self._process_dict(item)
-                    if isinstance(item, dict)
-                    else self._object_to_json_safe_dict(item)
-                    if hasattr(item, "__dict__")
-                    else item
-                    for item in v
-                ]
-            elif hasattr(v, "value") and hasattr(v.__class__, "__members__"):
-                # Handle enum
-                result[k] = {"__enum__": v.__class__.__name__, "value": v.value}
-            elif hasattr(v, "__dict__"):
-                # Handle nested objects
-                result[k] = self._object_to_json_safe_dict(v)
-            else:
-                # Basic types
-                result[k] = v
+                    # Parse the ISO format string
+                    dt = dt_class.fromisoformat(data["__datetime__"])
+                    return dt
+                except (ImportError, AttributeError, ValueError) as e:
+                    print(f"Error reconstructing datetime: {e}")
+                    return data

-        return result
+            # Check if this is a serialized enum
+            elif "__enum__" in data:
+                try:
+                    module_name = data.get("__module__", "builtins")
+                    enum_class = self._import_class(module_name, data["__enum__"])
+                    return enum_class(data["value"])
+                except (ImportError, AttributeError) as e:
+                    print(f"Error reconstructing enum: {e}")
+                    return data
+
+            # Check if this is a serialized Pydantic model
+            elif "__pydantic__" in data:
+                try:
+                    module_name = data.get("__module__", "builtins")
+                    model_class = self._import_class(module_name, data["__pydantic__"])
+                    return model_class(**self._reconstruct_object(data["data"]))
+                except (ImportError, AttributeError) as e:
+                    print(f"Error reconstructing Pydantic model: {e}")
+                    return data
+
+            # Regular dictionary
+            return {k: self._reconstruct_object(v) for k, v in data.items()}
+
+        # Handle lists
+        elif isinstance(data, list):
+            return [self._reconstruct_object(item) for item in data]
+
+        # Return primitive types as is
+        return data
+
+    def _import_class(self, module_name, class_name):
+        """Import a class from a module."""
+        module = __import__(module_name,
fromlist=[class_name]) + return getattr(module, class_name) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 9e70e3df0..e19cd8ba3 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -1,26140 +1,23384 @@ { - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. '), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100'), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'false'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - 
"logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. '), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100'), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'false'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is -100 degrees", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Fahrenheit.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. 
'), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\": \"get_boiling_point", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\", \"parameters\": {\"liquid_name\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"polyjuice\", \"cel", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "cius\": \"false\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": { - "arguments": { - "celcius": "false", - "liquid_name": "polyjuice" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "dc0f86d3-2b7a-45b0-8e58-8f49c9942190", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. 
'), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\": \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "get_boiling_point\", \"parameters", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\": {\"liquid_name\": \"polyjuice\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "celcius\": \"false\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "false", - "liquid_name": "polyjuice" + "metric": "prompt_tokens", + "span_id": "9ksjMloe", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:58.345129+00:00", + "__module__": "datetime" }, - "call_id": "00c0968b-d7d4-450d-a6ff-03d64ae9f772", - "tool_name": "get_boiling_point" + "trace_id": "6aGYLk4UShyrQ7uz", + "type": "metric", + "unit": "tokens", + "value": 139 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - 
"__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. '), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_bo", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "iling_point\", \"parameters\": {\"liquid_name\": \"poly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "juice\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "510ca34b-5ba9-4d5f-9ff3-c56de756fc95", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - 
"event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. '), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" + "metric": "completion_tokens", + "span_id": "9ksjMloe", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:58.345170+00:00", + "__module__": "datetime" }, - "call_id": "eda85f20-da80-4e11-a0e4-3849159ae70f", - "tool_name": "get_boiling_point" + "trace_id": "6aGYLk4UShyrQ7uz", + "type": "metric", + "unit": "tokens", + "value": 23 }, 
- "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, 
repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point_with_metadata', arguments={'liquid_name': 'polyjuice', 'celcius': 'true'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point_with_metadata', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point_with_metadata', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", - "type": "text" - }, - "event_type": { 
- "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\", \"parameters\": {\"liquid_name\": \"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "polyjuice\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "ac699f8a-43ca-4f0b-abd4-0597722b42ee", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - 
"__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": \"get_boiling_point\", \"parameters", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": {\"liquid_name\": \"poly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "juice\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - 
"__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" + "metric": "total_tokens", + "span_id": "9ksjMloe", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:58.345177+00:00", + "__module__": "datetime" }, - "call_id": "8b8b3ad5-5e47-4f56-a823-e2d82fa72d9c", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "trace_id": "6aGYLk4UShyrQ7uz", + "type": "metric", + "unit": "tokens", + "value": 162 + } + ] + } } ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point_with_metadata', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" }, - "tool_call": "", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { 
- "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"get_boiling_point", + "type": "text" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\", \"parameters\": {\"liquid_name\":", + "type": "text" }, - "tool_call": "get_boiling_point_with_metadata\", \"", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"polyjuice\", \"celcius\": \"false\"}}", + "type": "text" }, - "tool_call": "parameters\": {\"liquid_name\": \"poly", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "juice\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" }, - "call_id": "3438f2d7-895f-4a94-8e1f-c2f01860ce88", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - 
"metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Give me a sentence that contains the word: hello', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " customer smiled and said \"hello\" to the friendly store clerk", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\ndf.head()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', 
SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the `bwrap.core` module", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " is not found. This is likely because the `bwrap` package is not installed", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". 
To fix this, you can install the `bwrap", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "` package using pip:\n\n```\npip install bwrap\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "```\n\nHowever, since the `bwrap", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "` package is not a real package, you can ignore this error", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and continue with the code.\n\nThe code above will print a summary of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the CSV file, including the number of non-null values in each column", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", the data types of each column, and a summary of the central", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " tendency and dispersion of each numeric column.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\ndf.head()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, 
repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/c", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "z/vyh7y1d11xg881lsxsshnc5", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "c0000gn/T/tmpkbnyoruj/fzDfY", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "IPeinflation.csv\")\nprint(df.info())\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "print(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpkbnyoruj/fzDfYIPeinflation.csv\")\nprint(df.info())\nprint(df.describe())" + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "55492018-ad19-4593-9171-2b5dc2089960", + "tool_name": "get_boiling_point" }, - "call_id": 
"3fb76365-1f1f-4d06-a7d2-970ad7108e2b", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the `b", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "wrap.core` module is not found", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". This is likely because the `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "bwrap` package is not installed", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". 
To fix this, you can install the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " `bwrap` package using pip:\n\n```\npip install", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " bwrap\n```\n\nHowever, if", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you don't have the `bwrap` package installed,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you can't use the `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "b", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "wrap.core` module.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " In this case, you can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " try to load the CSV file using the `p", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "andas` library directly.\n\nHere is the corrected code:\n\n```", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "python\nimport pandas as pd\ndf = pd.read_csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(\"/var/folders/cz/vyh7y1d11x", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "g881lsxsshnc5c000", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "0gn/T/tmp8d5c", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "8spc/zOZSE5", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "zcinflation.csv\")\nprint(df.head())\nprint(df.info())\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "print(df.describe())\n```\n\nThis code will", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " load the CSV file and print the first few rows, information about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the data, and summary statistics.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, 
default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the `bwrap.core` module", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " is not found. This is likely", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " because the `bwrap` package is not installed. To fix this", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", you can install the `bwrap` package using pip:\n\n```\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "pip install bwrap\n```\n\nHowever, if you don't have", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " permission to install packages, you can use the `knowledge_search` function to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " get information about the CSV file instead:\n\n```\n{\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"type\": \"function\",\n \"name\": \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "knowledge_search\",\n \"parameters\": {\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"query\": \"describe a csv file\"\n }\n}\n``", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": 
null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "`\n\nThis will return a description of the CSV file.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": "", - "type": "tool_call" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/c", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" }, - "tool_call": "z/vyh7y1d11xg881lsxsshnc", - "type": "tool_call" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "5c0000gn/T/tmp8d5c8spc", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/zOZSE5zcinflation.csv\")\nprint(df.head())\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(df.info())\nprint(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - 
"tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp8d5c8spc/zOZSE5zcinflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "09b4d9a1-8ee4-4de4-a5a3-91cad464e668", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\ndf = pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/cz/v", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "yh7y1d11xg881", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "lsxsshnc5c0000gn/T/tmpn9tl", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "gts1/qYsQ3ZJLinflation.csv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpn9tlgts1/qYsQ3ZJLinflation.csv\")\nprint(df.head())\nprint(df.info())\nprint(df.describe())" + "metric": "prompt_tokens", + "span_id": "vTzYAYfO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:56.985637+00:00", + "__module__": "datetime" }, - "call_id": "6c3c4895-55a7-4083-b5d1-6ee42bcbe5fa", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "trace_id": "H8ytqaQLQXe6sEEJ", + "type": "metric", + "unit": "tokens", + "value": 91 }, - 
"type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m unable to access the file you provided. However, I can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " suggest a general approach to describe a CSV file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".\n\nYou can use the pandas", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " library in Python to load and inspect the CSV", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file. Here's a general outline of the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " steps you can follow:\n\n1. Import the pandas library:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " `import pandas as pd`\n2. Load the CSV file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " into a dataframe: `df = pd.read_csv('file.csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "')`\n3. Print the first few rows", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " of the dataframe: `print(df.head())`\n4", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". 
Print the data types of each column", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ": `print(df.dtypes)`\n5", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Print the summary statistics of the dataframe:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " `print(df.describe())`\n\nThis will give you a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " general idea of the structure and content of the CSV file.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " If you need more specific information, you can use other pandas functions", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to inspect the dataframe.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, 
system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Load the CSV file\ndf = pd.read_csv(\"/", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "var/folders/cz/vyh7y", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "1d11xg881lsxsshnc5c000", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "0gn/T/tmpjxdo91ce/g1r3", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "WGZRinflation.csv\")\n\n# Print the first few rows of", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " the dataframe\nprint(df.head())\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": 
"progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Print the data types of each column", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\nprint(df.dtypes)\n\n# Print the summary statistics", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " of the dataframe\nprint(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpjxdo91ce/g1r3WGZRinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "fbc1b233-207f-4f7b-8298-8d72a86d6f2c", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\ndf = pd.read_csv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(\"/var/folders/cz/vyh7y1d11x", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "g881lsxsshnc5c0000gn/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/tmp8d5c8spc/zOZSE5zcin", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation.csv\")\nprint(df.head())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp8d5c8spc/zOZSE5zcinflation.csv\")\nprint(df.head())" + "metric": "completion_tokens", + "span_id": "vTzYAYfO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:56.985707+00:00", + "__module__": "datetime" }, - "call_id": "c19a0d1e-6b44-408f-9839-819436425778", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "trace_id": "H8ytqaQLQXe6sEEJ", + "type": "metric", + "unit": "tokens", + "value": 45 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - 
"metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "var/folders/cz/vyh7", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "y1d11xg881lsxsshnc5c0000", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "gn/T/tmpn9tlgts1", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - 
"parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/qYsQ3ZJLin", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation.csv\")\nprint(df.head())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpn9tlgts1/qYsQ3ZJLinflation.csv\")\nprint(df.head())" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "e6c48b40-6504-4043-b3fa-644bd7fafd0f", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the file or the code you used to create the file. 
\\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n```\\n\\nThis will give you an idea of what the csv file contains.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "This", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " code will create a line plot of the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " average yearly inflation over time. The x-axis", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " represents the year and the y-axis represents", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the average inflation. 
The plot will also", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " include a title, labels", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " for the x and y axes, and a grid to make it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " easier to read.\n\nPlease note that you need to replace '", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "inflation.csv' with the actual path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to your csv file. Also, this code assumes that the csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file has a column named 'date' and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " another column named 'inflation'. 
If your csv file has", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " different column names, you need to adjust the code accordingly.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the file or the code you used to create the file. 
\\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n```\\n\\nThis will give you an idea of what the csv file contains.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " data\ndf = pd.read_csv('inflation.csv')\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Convert 'date' column to datetime\ndf['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "date'] = pd.to_datetime(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "['date'])\n\n# Group by year and", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": 
{ - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " calculate average inflation\naverage_inflation =", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " df.groupby(df['date'].dt.year)['inflation'].mean", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "()\n\n# Plot the time series\nplt.figure(figsize=(10,", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "6))\nplt.plot(average_inflation.index, average_inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".values, marker='o')\nplt.title('Average Yearly In", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation')\nplt.xlabel('Year')\nplt.ylabel('Average In", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "metric": "total_tokens", + "span_id": "vTzYAYfO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:56.985718+00:00", + "__module__": "datetime" }, - "call_id": 
"6b6c11d8-75d5-4b34-b97b-ee523c7a8168", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "trace_id": "H8ytqaQLQXe6sEEJ", + "type": "metric", + "unit": "tokens", + "value": 136 + } + ] + } } ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are running this code in a notebook, you can use the `upload` button to upload the file. 
If you are running this code in a script, you need to provide the file path.\\n\\nHere is an example of how you can describe the csv file if you have it in the same directory as your script:\\n\\n```python\\nimport pandas as pd\\n\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nThis will print the first few rows of the data, information about the data, and summary statistics about the data.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "This", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " code will create a time series plot of the average yearly inflation.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " The x-axis represents the year", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and the y-axis represents the average inflation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". The plot will show the trend of average yearly inflation over the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " years.\n\nPlease note that you need to replace 'inflation.csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "' with the actual path to your csv file. 
Also, this", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " code assumes that the csv file has a column named 'date'", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and another column named 'inflation'. If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " your csv file has different column names, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " need to adjust the code accordingly.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are running this code in a notebook, you can use the `upload` button to upload the file. 
If you are running this code in a script, you need to provide the file path.\\n\\nHere is an example of how you can describe the csv file if you have it in the same directory as your script:\\n\\n```python\\nimport pandas as pd\\n\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nThis will print the first few rows of the data, information about the data, and summary statistics about the data.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": "", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " data\ndf = pd.read_csv('inflation.csv')\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Convert date column to datetime\ndf['date'] = pd.to", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_datetime(df['date'])\n\n# Group by year and calculate 
average", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " inflation\naverage_inflation = df.groupby(df['date'].", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "dt.year)['inflation'].mean()\n\n# Plot", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " time series\nplt.figure(figsize=(10,6))\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".plot(average_inflation.index, average_inflation.values, marker='", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "('Year')\nplt.ylabel('", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Average Inflation')\nplt.grid(True)\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "plt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = 
df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" }, - "call_id": "81d7a873-376b-438e-916d-d5454e6ed09e", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "tool_call": "", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will load the csv file and print the first few rows, a summary of the data, and some descriptive statistics. \\n\\nPlease replace \\'inflation.csv\\' with the actual path to your csv file. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. 
\\n\\nPlease provide the actual file path or the file itself, and I will be happy to help you describe it.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\"inflation.csv\")\\n\\n# Convert date column to datetime\\ndf[\\'date\\'] = pd.to_datetime(df[\\'date\\'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df[\\'date\\'].dt.year)[\\'inflation\\'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Average Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "It", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " seems that the file \"inflation.csv\" does not exist.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \n\nTo plot the average yearly inflation as a time series, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " need to provide the actual file path or the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file itself. 
If you are using a local file, you can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use the `load_data` function from the `code_interpreter", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "` library to load the file. \n\nHere is an example of how", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you can do it:\n\n```\nimport pandas as pd\nfrom code_inter", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "preter import load_data\n\n# Load data\ndf", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " = load_data('inflation.csv')\n\n#", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Convert date column to datetime\ndf['date'] = pd.to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_datetime(df['date'])\n\n# Group by year", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and calculate average inflation\naverage_inflation = df.groupby(df['date", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'].dt.year)['inflation'].mean()\n\n# Plot time series\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "plt.figure(figsize=(10,6))\nplt.plot(average_inflation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".index, average_inflation.values, marker", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "='o')\nplt.title('Average Yearly Inflation')\nplt", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".xlabel('Year')\nplt.ylabel('Average Inflation')\nplt", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".grid(True)\nplt.show()\n```\n\nThis", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " will load the csv file, convert the date column to datetime", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", group by year and calculate the average inflation, and then plot the time", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " series.\n\nPlease replace 'inflation.csv' with the actual path to your", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " csv file. \n\nIf you are using a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " remote file, you need to provide the actual file path or the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file itself. 
\n\nPlease provide the actual file path or the file itself,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and I will be happy to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " help you plot the average yearly inflation as a time series.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can do it:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data(\\'inflation.csv\\')\\n\\n# Print summary of the data\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will load the csv file and print the first few rows, a summary of the data, and some descriptive statistics. \\n\\nPlease replace \\'inflation.csv\\' with the actual path to your csv file. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. 
\\n\\nPlease provide the actual file path or the file itself, and I will be happy to help you describe it.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": "", - "type": "tool_call" + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "inflation.csv\")\n\n# Convert date column to", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " datetime\ndf['date'] = pd.to_datetime(df['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "date'])\n\n# Group by year and calculate average inflation\naverage_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation = 
df.groupby(df['date'].dt.year)['inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "'].mean()\n\n# Plot time series\nplt.figure", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(figsize=(10,6))\nplt.plot", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(average_inflation.index, average_inflation.values, marker='", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ")\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"inflation.csv\")\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "da4cf054-6301-4408-85a8-35f15d1ff698", - "tool_name": { - "__enum__": "BuiltinTool", - 
"value": "code_interpreter" - } + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server or a local machine, you can use the `pd.read_csv()` function to load the csv file. 
\\n\\nHere is an example:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "This", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " code will create a line plot of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the average yearly inflation over time. The x-axis", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " represents the year and the y-axis represents the average", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " inflation. 
The plot also includes a title, labels for the x", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and y axes, and a grid for", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " better visibility.\n\nPlease note that you need", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to replace 'inflation.csv' with the actual path to your", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " csv file. Also, this code assumes that the 'date", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "' column in your csv file is in a format that can be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " parsed by pandas' `to_datetime` function. If your date", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " column is in a different format, you may need to specify the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " format when calling `to_datetime`.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent 
call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\"), CompletionMessage(role='assistant', content='It seems that the file \"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server or a local machine, you can use the `pd.read_csv()` function to load the csv file. \\n\\nHere is an example:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv(\\'inflation.csv\\')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": "", - "type": "tool_call" + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\ndf = pd.read_csv('inflation.csv')\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Convert 'date' column to datetime\ndf['date']", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": 
"ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " = pd.to_datetime(df['date'])\n\n# Group by", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " year and calculate average inflation\naverage_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby(df['date'].dt.year", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ")['inflation'].mean()\n\n# Plot the time series", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(average_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation.index, average_inflation.values, marker", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "='o')\nplt.title('Average Yearly Inflation')\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "plt.xlabel('Year')\nplt.ylabel('Average", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 
'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "65691869-f741-420c-bb73-23a1f8c0d82a", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "It", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": 
"progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " seems that the file \"/var/f", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "olders/cz/vyh7y1d11", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "xg881lsxsshnc5c0000gn/T/tmp8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "d5c8spc/Q8Y9qzV", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Xinflation.csv\" does not exist", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". \n\nTo describe the csv file, you need to provide", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the actual file path or the file itself", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". If you are using a remote server or a local machine,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you can use the `pd.read_csv()` function to load the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " csv file. 
\n\nHere is an example:\n\n```python\nimport", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " pandas as pd\n# Load data\ndf", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " = pd.read_csv('inflation.csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "')\n# Print the first 5 rows of the dataframe\nprint", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(df.head())\n# Print the summary of the dataframe\nprint(df", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".info())\nprint(df.describe())\n```\n\nThis will print the first", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " 5 rows of the dataframe,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the summary of the dataframe (including the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " index dtype and column count), and the description of the dataframe", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " (including count, mean, std,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " min, 25%, 50%, 75%, max", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " for each column).", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - 
"('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": "", - "type": "tool_call" + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n# Load data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\ndf =", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " pd.read_csv(\"/var/folders/cz/vyh7", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "y1d11xg881lsx", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "sshnc5c0000gn/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - 
"tool_call": "/tmp8d5c8spc", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/Q8Y9qzVXinflation.csv\")\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "# Rows\nprint(\"Number of rows and columns in the data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " are:\", len(df.columns))\n# Column names\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(\"Columns of the data are:\", df.columns)\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "# Column dtypes\nprint(\"Datatype of the columns are", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ":\", df.dtypes)", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp8d5c8spc/Q8Y9qzVXinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "15893b4c-5a55-4ea7-9902-8a2f28fa3659", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "tool_call": "\", \"celcius\": \"true\"}}", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:255c3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:3b16c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:14b97\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:14b97\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:14b97\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow these", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " steps:\n\n1. Install Torchtune and its dependencies", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".\n2. Download the Llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2 weights and tokenizer.\n3. Use the `l", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ora_llama2_7b` model in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", which applies LoRA to the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Q and V projections by default.\n4.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Set the `lora_attn_modules` argument to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " apply LoRA to all linear", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " layers in the self-attention.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "5. 
Increase the rank and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " alpha values to experiment with different LoRA", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " configurations.\n6. Run the LoRA finetuning", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " recipe in Torchtune using the `lora_finet", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "une_distributed` command.\n7.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Monitor the loss curves and adjust the Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA configuration as needed to trade off memory and model performance.\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "By following these steps, you can effectively use LoRA in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Torchtune to fine-tune Llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2 models with a low memory footprint.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:255c3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:3b16c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "parameters\": {\"query\": \"How to use LoRA in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Torchtune\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + 
"delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" }, - "call_id": "41f1d05b-cfca-4d54-a0de-38a968017c8b", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:255c3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:3b16c\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m ready to help you answer questions about Torchtune based", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " on the documentation you provided. 
What's your first question?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:292ee\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:2513e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:47152\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:47152\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:47152\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow these steps", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ":\n\n1. Install Torchtune and its dependencies.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2. Download the Llama2 weights and tokenizer.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "3. Use the `lora_llama2_", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "7b` model in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", which applies LoRA to the Q", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and V projections by default.\n4", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Load the base model weights into the LoRA model without any", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " conversion necessary.\n5. Set only LoRA parameters to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " trainable.\n6. 
Run the LoRA finetuning recipe", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " in Torchtune with the desired configuration.\n\nYou", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can also experiment with different LoRA configurations, such as", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " applying LoRA to all linear layers in the self", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "-attention, increasing the rank, or scaling alpha and rank", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " together.\n\nBy following these steps, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can use LoRA in Torchtune to fine-tune a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 model with a low memory footprint and achieve good", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " performance.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:292ee\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:2513e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "6dd93d40-18ea-40c1-9e4d-78b3bd865e67", + "tool_name": "get_boiling_point" }, - "call_id": "5beb7c24-953b-4ad7-b834-a26522fb5ac7", - "tool_name": "knowledge_search" + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:292ee\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:2513e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m ready to help you answer questions about Torchtune based", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " on the documentation you provided. What's your first question", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:ab1b9\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8bcf6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:cc646\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:cc646\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:cc646\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can follow these steps:\n\n1. Install Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and its dependencies.\n2. Download the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 weights and tokenizer.\n3", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". 
Use the `lora_llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2_7b` model in Torchtune, which", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " applies LoRA to the Q and V", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " projections by default.\n4. Load the base model weights into", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the LoRA model without any conversion necessary.\n5. Set", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " only LoRA parameters to trainable.\n6. Run the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " LoRA finetuning recipe in Torchtune with the desired", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " configuration.\n\nYou can also experiment with different LoRA configurations, such", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " as applying LoRA to all linear layers in the self-attention", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", increasing the rank, or scaling alpha and rank together.\n\nBy", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " following these steps, you can use LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to fine-tune a Llama2", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " model with parameter-efficient finetuning and memory savings.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:ab1b9\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8bcf6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"knowledge_search\", \"parameters", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\": {\"query\": \"How to use LoRA in Tor", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "chtune\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "5af3ef1f-98c0-4c60-9b8b-892b5e921040", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some 
documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:ab1b9\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8bcf6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m ready to help you answer questions about Torchtune based on", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the documentation you provided. What's your first question?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:606ad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:e37c3\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:606ad\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:0b7ba\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. 
note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA, you can follow these steps", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ":\n\n1. Install the necessary packages, including torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and the Llama2 model.\n2.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Load the Llama2 model and specify which layers to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " apply LoRA to.\n3. 
Define the LoRA parameters", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", such as the rank and alpha values.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "4. Train the model using the LoRA fine-tuning", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " recipe in torchtune.\n5. Use the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " trained model for inference or further fine-tuning.\n\nHere is an", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " example of how to use LoRA with the Llama2 model", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ":\n\n```python\nfrom torchtune.models.llama2 import", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " llama2_7b, lora_llama2_7", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "b\n\n# Build Llama2 without any LoRA layers\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "base_model = llama2_7b()\n\n# The default settings for", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " lora_llama2_7b will match", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " those for llama2_7b\n# We just need to define", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " which layers we want LoRA applied to.\n# Within each self-", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": 
"attention, we can choose from [\"q_proj\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "k_proj\", \"v_proj\", and \"output_proj\"].\n# We", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can also set apply_lora_to_mlp=True or apply_lora", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_to_output=True to apply LoRA to other linear\n# layers outside", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " of the self-attention.\nlora_model = lora_llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2_7b(lora_attn_modules=[\"q_proj\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "v_proj\"])\n\n# Print the first layer's self-attention in the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " usual Llama2 model\nprint(base_model.layers[0].at", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "tn)\n# Print the same for Llama2 with LoRA weights", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\nprint(lora_model.layers[0].attn)\n```\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "This code loads the Llama2 model and applies LoRA to the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " query and value projection layers. 
You can modify the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lora_attn_modules` parameter to apply LoRA to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " different layers.\n\nTo train the model using the LoRA fine", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "-tuning recipe in torchtune, you can use the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " following command:\n\n```bash\ntune run l", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ora_finetune_single_device --config llama3/", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "8B_lora_single_device\n```\n\nThis will train the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " model for one epoch on a common instruct", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " dataset. You can modify the command to change the training settings", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", such as the number of epochs or the batch size.\n\nAfter", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " training, you can use the trained model for inference or further fine", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "-tuning. 
You can load the model using the `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "load_checkpoint` method and use it to make predictions or continue", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " training.\n\n```", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "out_of_tokens" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\": {\"query\": \"How to use LoRA\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA" + "metric": "prompt_tokens", + "span_id": "tBuntiC1", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:54.993737+00:00", + "__module__": "datetime" }, - "call_id": "8b617e66-08b4-4e93-8219-29b8b84c4672", - "tool_name": "knowledge_search" + "trace_id": "5SueXj79Q2e5n37g", + "type": "metric", + "unit": "tokens", + "value": 43 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:c4b2d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e37c3\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "You", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can use the following function call to answer the user's question:\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"type\": \"function\", \"name\": \"knowledge_search\",", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"parameters\": {\"query\": \"How to fine-tune a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 model with LoRA in torchtune\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f3963\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e075f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:0484f\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:0484f\\nContent: 06% of all params are trainable.\\n\\n.. 
note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:0484f\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. 
note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow these steps", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ":\n\n1. Install Torchtune and its dependencies.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2. Download the Llama2 weights and tokenizer.\n3", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Use", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the `lora_llama2_7", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "b` model in Torchtune, which applies", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " LoRA to the Q and V projections by default", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".\n4. 
Load the base model weights into the LoRA", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " model without any conversion necessary.\n5.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Set only LoRA parameters", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to trainable.\n6.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Run the LoRA fin", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "etuning recipe in Torcht", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "une with the desired configuration.\n\nYou can also experiment", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " with different Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA configurations, such as applying LoRA to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " all linear layers in the self-attention, increasing the rank,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " or scaling alpha and rank together.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f3963\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e075f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\": \"knowledge_search\", \"parameters\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " {\"query\": \"How to use Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA in Torchtune\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": 
"42e1de09-f47e-44b0-9331-9b878556970d", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f3963\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:e075f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m ready to help you answer questions about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Torchtune based on the documentation you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " provided. What's your first question?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'How to use LoRA'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:cbc88\\nContent: .. 
_lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:8892b\\nContent: with training with LoRA quickly,\\njust specify any config with ``_lora`` in its name, e.g:\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n\\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\\nwhich linear layers LoRA should be applied to in the model:\\n\\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\\n LoRA to:\\n\\n * ``q_proj`` applies LoRA to the query projection layer.\\n * ``k_proj`` applies LoRA to the key projection layer.\\n * ``v_proj`` applies LoRA to the value projection layer.\\n * ``output_proj`` applies LoRA to the attention output projection layer.\\n\\n Whilst adding more layers to be fine-tuned may improve model accuracy,\\n this will come at the cost of increased memory usage and reduced training speed.\\n\\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\\n* ``apply_lora_to_output: Bool`` applies LoRA to the model\\'s final output projection.\\n This is usually a projection to vocabulary space (e.g. in language models), but\\n other modelling tasks may have different projections - classifier models will project\\n to the number of classes, for example\\n\\n.. note::\\n\\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\\n final output projection do not support ``apply_lora_to_output``.\\n\\nThese are all specified under the ``model`` flag or config entry, i.e:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.llama3.lora_llama3_8b\\n apply_lora_to_mlp: True\\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\\n\\nSecondly, parameters which control the scale of the impact of LoRA on the model:\\n\\n* ``lora_rank: int`` affects the scale of\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:cbc88\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:9dcb7\\nContent: ora_finetune_label>`.\\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\\n\\nLet\\'s take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\\n\\n.. note::\\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\\n\\nWe can also add :ref:`command-line overrides ` as needed, e.g.\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n checkpointer.checkpoint_dir= \\\\\\n tokenizer.path=/tokenizer.model \\\\\\n checkpointer.output_dir=\\n\\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\\nthen save a final checkpoint in the same directory following the original format. For more details on the\\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\\n\\n.. note::\\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\\n\\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\\nwill\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA, you can follow these steps:\n\n1.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Install the necessary packages, including torchtune and the L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lama2 model.\n2. Load the Llama2 model", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and specify which layers to apply LoRA to.\n3. ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Define the LoRA parameters, such as the rank and alpha values", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".\n4. 
Train the model using the LoRA fine-t", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "uning recipe in torchtune.\n\nHere is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " an example of how to use", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " LoRA with the Llama2 model", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ":\n\n```python\nfrom torchtune.models.llama2 import", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " llama2_7b, lora_llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2_7b\n\n# Build Llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2 without any LoRA layers\nbase_model = llama2_", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "7b()\n\n# The default settings for lora_llama2", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_7b will match those for llama2_7", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "b\n# We just need to define which layers we want", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " LoRA applied to.\n# Within each self", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "-attention, we can choose from [\"q_proj\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "k_proj\", \"v_proj\", and \"output_proj", - "type": "text" - }, - "event_type": { - 
"__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\"].\n# We can also set apply_lora_to_m", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lp=True or apply_lora_to_output=True", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to apply LoRA to other", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " linear\n# layers outside of the self-attention.\nl", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ora_model = lora_llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2_7b(lora_at", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "tn_modules=[\"q_proj\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "v_proj\"])\n\n# Print the first", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " layer's self-attention in the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " usual Llama2 model\nprint", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(base_model.layers[0].attn)\n# Print the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " same for Llama2 with LoRA", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " weights\nprint(lora_model.layers[0].attn", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { 
- "delta": { - "text": ")\n```\n\nThis code will load the Llama2 model", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and apply LoRA to the specified layers. The `l", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ora_attn_modules` parameter is used to specify which layers", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to apply LoRA to, and the `apply_lora", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_to_mlp` and `apply_lora_to_output`", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " parameters can be used to apply LoRA", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to other linear layers outside of the self", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "-attention.\n\nYou can also use the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " `tune run` command to fine-tune the model", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " using the LoRA fine-tuning recipe in torchtune.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " For example:\n\n```bash\ntune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " run lora_finetune_single_device --config llama3", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/8B_lora_single_device\n``", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": 
"`\n\nThis will run the LoRA fine-tuning recipe on the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama3-8B-Instruct model using the default configuration", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". You can modify the configuration by adding command-line overrides, such", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " as:\n\n```bash\ntune run", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "out_of_tokens" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. 
note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='You can use the following function call to answer the user\\'s question:\\n\\n{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\": {\"query\": \"How to fine-tune a Llama2 model with LoRA in torchtune\"}}', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "name\": \"knowledge_search\", \"parameters\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " {\"query\": \"How to use LoRA\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA" + "metric": "completion_tokens", + "span_id": "tBuntiC1", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:54.993758+00:00", + "__module__": "datetime" }, - "call_id": "64448cc3-c11a-4bae-bdcc-e5b8d13b888f", - "tool_name": "knowledge_search" + "trace_id": "5SueXj79Q2e5n37g", + "type": "metric", + "unit": "tokens", + "value": 10 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f4fd3\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. 
Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:8892b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "You", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " can use the following function call to answer", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the user's question:\n\n{\"type\": \"function\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "name\": \"knowledge_search\", \"parameters\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " {\"query\": \"How to fine-tune a Llama2", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " model with LoRA in torchtune\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\", \"parameters\": {\"query\": \"Torchtune", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " documentation\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "0f0eb27a-1126-4d26-8b33-b630a9518093", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), 
CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. 
If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " attention type used by Llama3-8B is grouped", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "-query attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. 
You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " attention type used by Llama3-", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "8B is grouped-query attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"type\": \"function\",\n \"name\": \"knowledge", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_search\",\n \"parameters\": {\n \"query\": \"L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lama3-8B attention type\"\n }\n}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" + "metric": "total_tokens", + "span_id": "tBuntiC1", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:54.993761+00:00", + "__module__": "datetime" }, - "call_id": "ce62cb6d-fcb0-437a-abd9-b0bed88628ed", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "trace_id": "5SueXj79Q2e5n37g", + "type": "metric", + "unit": "tokens", + "value": 53 + } + ] + } } ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" }, - "tool_call": "", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C.", + "type": "text" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": " \"parameters\": {\"query\": \"L", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" }, - "tool_call": "lama3-8B attention type\"}}", - "type": "tool_call" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "25fcc4f2-72a8-4175-82ca-c7a692d13d66", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, 
tool_calls=[ToolCall(call_id='', tool_name=, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\"Challah Horse\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', 
context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " current CEO of Meta is Mark Zuckerberg", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', 
content='Search the web and tell me who the current CEO of Meta is.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Zuckerberg\\'s political pivot targets Apple, puts Meta staffers on edge\", \"url\": \"https://www.cnbc.com/2025/02/14/zuckerbergs-rightward-policy-shift-hits-meta-staffers-targets-apple.html\", \"content\": \"Meta CEO Mark Zuckerberg\\'s actions to curry favor with the president have rattled employees, but people familiar with his efforts say there\\'s a clear strategy.\", \"score\": 0.77179235, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. 
Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": 
"ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "brave_search.call(query=\"current", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " CEO of Meta\")", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "current CEO of Meta" + "metric": "prompt_tokens", + "span_id": "03QQgo3b", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:34.636678+00:00", + "__module__": "datetime" }, - "call_id": "f5d644f1-3ada-4a5a-a088-736c89428fe9", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "brave_search" - } + "trace_id": "mE4SuRfcQUOcOyP2", + "type": "metric", + "unit": "tokens", + "value": 85 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice='get_boiling_point', tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - 
"metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function `get_boiling_point`", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " is not able to find the boiling point of polyjuice as", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " it is a fictional liquid from the Harry Potter series.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice='get_boiling_point', tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function `get_boiling_point` is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not able to find the boiling point of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - 
}, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " polyjuice as it is a fictional", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " liquid from the Harry Potter series. The function is only able", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to find the boiling point of real liquids.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)}), ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function `get_boiling_point` is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " able to find the boiling point of poly", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": 
"progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "juice as it is not a real liquid. Polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " is a magical potion from the Harry Potter", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " series.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)}), ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function `get_boiling_point`", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " is not able to find the boiling point of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " polyjuice as it is not a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " real liquid.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function `get_boiling_point` is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " able to find the boiling point of polyjuice as it is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not a real liquid. 
Polyjuice is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a magical potion from the Harry Potter series.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice'})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function `get_boiling_point` is not able", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to find the boiling point of polyju", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ice as it is not a real", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " liquid.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - 
"type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice='get_boiling_point', tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_bo", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "iling_point\", \"parameters\": {\"liquid", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_name\": \"polyjuice\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "490c45b2-2a13-4ee1-9e37-711fabdbcc88", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - 
"type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice='get_boiling_point', tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\", \"name\": \"get_boiling_point\",", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " \"parameters\": {\"liquid_name\": \"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "polyjuice\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" + "metric": "completion_tokens", + "span_id": "03QQgo3b", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:34.636767+00:00", + 
"__module__": "datetime" }, - "call_id": "22050f4b-36df-48fb-ac11-e3a47fa0beaf", - "tool_name": "get_boiling_point" + "trace_id": "mE4SuRfcQUOcOyP2", + "type": "metric", + "unit": "tokens", + "value": 22 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)}), ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name\": \"polyjuice", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - 
"metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "b5f6f475-f1ed-4916-9959-405e72ca0c1d", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)}), ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": \"get_boiling_point\", \"parameters", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": {\"liquid_name\": 
\"polyjuice\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" + "metric": "total_tokens", + "span_id": "03QQgo3b", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:34.636773+00:00", + "__module__": "datetime" }, - "call_id": "11302682-7a3a-45f3-955b-6709444fd626", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "trace_id": "mE4SuRfcQUOcOyP2", + "type": "metric", + "unit": "tokens", + "value": 107 + } + ] + } } ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": 
{\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " couldn't find any information on the boiling point of Polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Polyjuice is a magical potion in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the Harry Potter series that allows the drinker to transform into", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " someone else. It's not a physical substance with a boiling point", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". 
If you have any other questions, I'd be happy to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " help.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " couldn't find any information on the boiling point", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " of Polyjuice. Polyjuice is a magical potion in the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Harry Potter series that allows the drinker to transform into someone else. It's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not a physical substance with a boiling point. 
If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you have any other questions, I'd be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " happy to help.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='str', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": "", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - 
"value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C.", + "type": "text" }, - "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"", - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": "polyjuice\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "3e1a2cdc-46c3-4f2f-9fca-874fdea1700c", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', 
description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "_point\", \"parameters\": {\"liquid_name", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": \"polyjuice\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" + "metric": "prompt_tokens", + "span_id": "vzNuoz4e", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:45.792508+00:00", + "__module__": "datetime" }, - "call_id": "e704d0f9-45a1-4ed1-90b0-8a05c504da6c", - "tool_name": "get_boiling_point" + "trace_id": "vNRMmadcTVmfkn5-", + "type": "metric", + "unit": "tokens", + "value": 87 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Write code and execute it to find the answer for: What is the 100th prime number?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 
1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " 100th prime number is 541.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Write code and execute it to find the answer for: What is the 100th prime number?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "def is_prime(n):\n if n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - 
"metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " <= 1:\n return False\n if n <= 3:\n return", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " True\n if n % 2 == 0 or n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " % 3 == 0:\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " return False\n i = 5\n while i *", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " i <= n:\n if n % i", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " == 0 or n % (i + 2", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ") == 0:\n return False", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\n i += 6\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " return True\n\ndef get_nth_prime(n):\n count = ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "0\n num = 2\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": 
{ - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " while True:\n if is_prime(num):\n count += ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "1\n if count == n:\n return num\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " num += 1\n\nprint(get_nth_prime(100))", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "6d57c323-7679-447f-9928-ccab76c0bdc9", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative 
trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Per", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "plexity the company was founded in 202", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of 
knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Per", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "plexity the company was founded in 2022.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity company founding date" + "metric": "completion_tokens", + "span_id": "vzNuoz4e", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:45.792536+00:00", + "__module__": "datetime" }, - "call_id": "22d5440e-2873-4956-a81f-f114fc78671d", - "tool_name": "knowledge_search" + "trace_id": "vNRMmadcTVmfkn5-", + "type": "metric", + "unit": "tokens", + "value": 22 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a 
quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\",", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"parameters\": {\"query\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"Perplexity company founding date\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity company founding date" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "e4a5ff1d-ac00-4e0a-b93b-17e19fa3bc55", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": 
"generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": \"knowledge_search\", \"parameters\": {\"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "query\": \"Perplexity company founding date\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity company founding date" + "metric": "total_tokens", + "span_id": "vzNuoz4e", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:45.792544+00:00", + "__module__": "datetime" }, - "call_id": "98d3790b-1b84-4ab7-ad66-117fea68d5db", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": 
"complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "trace_id": "vNRMmadcTVmfkn5-", + "type": "metric", + "unit": "tokens", + "value": 109 + } + ] + } } ], "type": "generator" }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", 
- "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " \"parameters\": {\"query\": \"Perplexity company founding", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " date\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity company founding date" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" }, - "call_id": "6add8292-f388-4ec5-8ec5-5071c9397492", - "tool_name": "knowledge_search" + "tool_call": "", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 
2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " NBA was created on August ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "3, 1949, with", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the merger of the Basketball Association of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " America (BAA) and the National Basketball League", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " (NBL).", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - 
"__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " NBA was created on August 3, ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "1949, with the merger of the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Basketball Association of America (BAA) and the National Basketball", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " League (NBL).", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'when was the nba created'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), 
CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'when was the nba created'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " NBA was created on August 3, 1949,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " with the merger of the Basketball Association of America", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " (BAA) and the National Basketball", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " League (NBL).", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'when was the nba created'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: 
Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "{\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "type\": \"function\", \"name\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " \"knowledge_search\", \"parameters\": {\"query", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\": \"when was the nba created\"}}", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "when was the nba created" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "c132966d-e4be-47de-9512-7e9e2e6d896c", - "tool_name": "knowledge_search" + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "type": "tool_call" }, - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - 
}, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": \"knowledge_search\", \"parameters", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\": {\"query\": \"when was", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " the nba created\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "when was the nba created" + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "0145ecf7-ff15-4e06-8684-d9c60e0e2966", - "tool_name": "knowledge_search" + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\", \"celci", + "type": "tool_call" }, - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.1-8B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\", \"parameters\": {\"query\": \"NBA creation date\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "NBA creation date" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "f50656dd-201d-44b0-8f9f-ca88b970b3fd", - "tool_name": "knowledge_search" + "tool_call": "us\": \"true\"}}", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. 
'), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': True})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " provided function definitions are not suitable for", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " this task", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Please re", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "work them to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " align with the task requirements.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant Always respond with tool calls no matter what. 
'), UserMessage(role='user', content='Get the boiling point of polyjuice with a tool call.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "', celcius=True)]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" }, - "call_id": "1fc2d874-894e-4857-ae2b-7aacc75c330e", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', 
tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': True})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " function call returned an error", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " since \"polyjuice\" is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not a real liquid. Polyjuice is a fictional substance", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " from the Harry Potter series. 
The boiling", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " point of a liquid is a physical", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " property that can be measured and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " quantified, but it only applies", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to real substances that exist in the physical world.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Call get_boiling_point and answer What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "', celcius=True)]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, 
- { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "98d5962a-eab3-4d83-bca4-d4d6aa54f1dc", + "tool_name": "get_boiling_point" }, - "call_id": "7d72d1ae-9f52-40c7-8dc5-48fff52b253a", - "tool_name": "get_boiling_point" + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Give me a sentence that contains the word: hello', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "When", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " I answered the phone, the friendly", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " voice on the other end said \"hello\" and asked how I was doing", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use 
code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)\\n# Sample of data\\nprint(\"Data sample from file:\")\\nprint(df.head())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file 'bwrap' was not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " found. This is likely because the file path provided is incorrect or the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file does not exist in the specified location.\n\nTo resolve", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " this issue, you should ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file path is correct and the file exists in the specified location. 
If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file is located in a different directory, you should", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " provide the correct file path.\n\nAdditionally, you can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use the `os` module to check if the file exists before attempting", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to read it. Here", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'s an example:\n\n```python\nimport os\nimport", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " pandas as pd\n\nfile_path = \"/var/folders", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/rb/qv8vwgyj6yjd3t4p", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "wsy9t0rm0000gn/T/tmp4n_d_h", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "5o/u4yh2j11inflation.csv\"\n\nif", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " os.path.isfile(file_path):\n df =", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " pd.read_csv(file_path)\n print", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(\"Number of rows and columns in the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " data:\", df.shape)\n print(\"Columns of the data are:\", len", - "type": "text" - }, - 
"event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(df.columns))\n print(\"Columns of the data are:\", df.columns)\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " print(\"Datatype of the columns are:\", df.dtypes)\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " print(\"Data sample from file:\")\n print(df.head())\nelse:\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " print(\"The file does not exist\")\n```\n\nThis code checks if", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file exists before attempting to read it. If the file does not exist", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", it prints a message indicating that the file does not exist.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv file, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": "", - "type": "tool_call" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "metrics": null + } }, { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": "import pandas as pd\n# Load", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" }, - "tool_call": " data\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp4n_d_h5o/u4", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "yh2j11inflation.csv\")\n# Rows\nprint(\"Number of", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " rows 
and columns in the data:\", df.shape)\n# Columns\nprint(\"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Columns of the data are:\", len(df.columns))\n# Column names\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(\"Columns of the data are:\", df.columns)\n# Column dtypes\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "print(\"Datatype of the columns are:\", df.dtypes)\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp4n_d_h5o/u4yh2j11inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "517038eb-c373-441b-96fe-3a0e2f063fc0", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, 
tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file being moved to a different location.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does exist, you can try to load it using the correct path. If the file does not exist, you will need to create it or obtain it from the relevant source.\\n\\nHere is an example of how you can modify the code to handle this situation:\\n\\n```\\nimport pandas as pd\\n\\n# Define the path to the CSV file\\nfile_path = \"\"\\n\\n# Check if the file exists\\nimport os\\nif os.path.isfile(file_path):\\n # Load the CSV file\\n df = pd.read_csv(file_path)\\n\\n # Print the first few rows of the dataframe\\n print(df.head())\\n\\n # Print information about the dataframe\\n print(df.info())\\n\\n # Print summary statistics about the dataframe\\n print(df.describe())\\nelse:\\n print(\"The file does not exist.\")\\n```\\n\\nThis code will check if the file exists before attempting to load it. If the file does not exist, it will print a message indicating that the file does not exist.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": 
"text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmpbb210725/duWDtjG", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ninflation.csv\" does not exist. This could be due to a number", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " of reasons such as the file being deleted, the path being incorrect, or", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file being moved to a different location.\n\nTo resolve this issue, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " should ensure that the file exists and the path is correct. If the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " does exist, you can try to load it using the correct path. 
If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file does not exist, you will need to create it or obtain it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " from the relevant source.\n\nHere is an example of how you can modify the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " code to handle this situation:\n\n```\nimport pandas as pd\nimport matplotlib", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".pyplot as plt\n\n# Define the path to the CSV file\nfile_path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " = \"/var/folders/rb/qv8vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/tmpbb210725/duWDtjGninflation.csv\"\n\n#", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Check if the file exists\nimport os\nif os.path.isfile(file_path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "):\n # Load the CSV file\n df = pd.read_csv(file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_path)\n\n # Convert the 'Year' column to datetime\n df", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " # Group by 'Year' and calculate the average inflation\n df", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, 
- "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_avg_inflation = df.groupby('Year')['Inflation'].mean().reset", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_index()\n\n # Plot the average yearly inflation as a time series\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " plt.figure(figsize=(10,6))\n plt.plot(df_avg_inflation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "['Year'], df_avg_inflation['Inflation'], marker='o')\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " plt.title('Average Yearly Inflation')\n plt.xlabel('Year", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "')\n plt.ylabel('Inflation')\n plt.grid(True)\n plt", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".show()\nelse:\n print(\"The file does not exist.\")\n```\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "This code will check if the file exists before attempting to load it. 
If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file does not exist, it will print a message indicating that the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " does not exist.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file being moved to a different location.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does exist, you can try to load it using the correct path. If the file does not exist, you will need to create it or obtain it from the relevant source.\\n\\nHere is an example of how you can modify the code to handle this situation:\\n\\n```\\nimport pandas as pd\\n\\n# Define the path to the CSV file\\nfile_path = \"\"\\n\\n# Check if the file exists\\nimport os\\nif os.path.isfile(file_path):\\n # Load the CSV file\\n df = pd.read_csv(file_path)\\n\\n # Print the first few rows of the dataframe\\n print(df.head())\\n\\n # Print information about the dataframe\\n print(df.info())\\n\\n # Print summary statistics about the dataframe\\n print(df.describe())\\nelse:\\n print(\"The file does not exist.\")\\n```\\n\\nThis code will check if the file exists before attempting to load it. 
If the file does not exist, it will print a message indicating that the file does not exist.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmpbb210725/duWDtj", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Gninflation.csv\")\n\n# Convert the 'Year' column to datetime\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "df['Year'] = pd.to_datetime(df['Year'], format", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - 
"event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "='%Y')\n\n# Group by 'Year' and calculate", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " the average inflation\ndf_avg_inflation = df.groupby('", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " inflation as a time series\nplt.figure(figsize=(10", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ",6))\nplt.plot(df_avg_inflation['Year'], df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt.title('Average Yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpbb210725/duWDtjGninflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = 
df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + "metric": "prompt_tokens", + "span_id": "1A0bWgLL", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:24.102366+00:00", + "__module__": "datetime" }, - "call_id": "a6646608-a943-4849-884e-1852d5ef4a7e", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "trace_id": "4a5HMcM9R3uWB4Cv", + "type": "metric", + "unit": "tokens", + "value": 37 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file being in a different location.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists in the specified location.\\n2. Use a relative path: If the file is in the same directory as your Python script, you can use a relative path instead of an absolute path.\\n3. Check file permissions: Make sure you have the necessary permissions to read the file.\\n4. 
Use a try-except block: You can use a try-except block to catch the FileNotFoundError and handle it accordingly.\\n\\nHere is an example of how you can modify the code to handle the FileNotFoundError:\\n\\n```\\nimport pandas as pd\\n\\ntry:\\n df = pd.read_csv(\"\")\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nexcept FileNotFoundError:\\n print(\"The file does not exist\")\\n```\\n\\nThis code will print \"The file does not exist\" if the file is not found, instead of raising an error.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/rb/qv8vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "jd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmpdcpkc9", - "type": "text" - }, - "event_type": { - "__enum__": 
"ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_f/FKWQnYoVinflation.csv\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " does not exist. This could be due to a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " number of reasons such as the file being deleted,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the path being incorrect, or the file being in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a different location.\n\nTo resolve this issue, you can try the following:\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "1. Check the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " path: Make sure the file path is correct and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file exists in the specified location.\n2. Use a relative path:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " If the file is in the same directory as your Python script, you can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use a relative path instead of an absolute path.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "3. Check file permissions: Make sure you have the necessary permissions to read", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file.\n4. 
Use a try-except block: You can use", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a try-except block to catch the FileNotFoundError and handle it accordingly.\n\nHere", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " is an example of how you can modify the code to handle the FileNotFoundError:\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "```\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ntry:\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " df = pd.read_csv(\"/var/folders/rb/q", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "v8vwgyj6yjd3t4pwsy9", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "t0rm0000gn/T/tmpdcpkc9_f/FKW", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "QnYoVinflation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".csv\")\n df['Year'] = pd.to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_datetime(df['Year'], format='%Y')\n df_avg_inflation =", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " df.groupby('Year')['Inflation'].mean().reset_index()\n plt", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".figure(figsize=(10,6))\n plt.plot(df_avg_inflation['", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Year'], df_avg_inflation['Inflation'], marker='o')\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": 
null - }, - { - "event": { - "delta": { - "text": " plt.title('Average Yearly Inflation')\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " plt.xlabel('Year')\n plt.ylabel('Inflation')\n plt.grid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(True)\n plt.show()\nexcept FileNotFoundError:\n print(\"The file does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not exist\")\n```\n\nThis code will print \"The file does not exist", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\" if the file is not found, instead of raising an error.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file being in a different location.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists in the specified location.\\n2. Use a relative path: If the file is in the same directory as your Python script, you can use a relative path instead of an absolute path.\\n3. Check file permissions: Make sure you have the necessary permissions to read the file.\\n4. 
Use a try-except block: You can use a try-except block to catch the FileNotFoundError and handle it accordingly.\\n\\nHere is an example of how you can modify the code to handle the FileNotFoundError:\\n\\n```\\nimport pandas as pd\\n\\ntry:\\n df = pd.read_csv(\"\")\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nexcept FileNotFoundError:\\n print(\"The file does not exist\")\\n```\\n\\nThis code will print \"The file does not exist\" if the file is not found, instead of raising an error.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\n\n# Load the CSV file\ndf = pd", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "pwsy9t0rm0000gn", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": 
{ - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/T/tmpdcpkc9_f/FKWQ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "nYoVinflation.csv\")\n\n# Convert the '", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Year' column to datetime\ndf['Year'] = pd", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".to_datetime(df['Year'], format", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt.title('Average Yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpdcpkc9_f/FKWQnYoVinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 
'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "619c3b2c-3e23-485f-85bd-38a5ecf398b2", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. 
If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, you can use the `os` module to check if the file exists before trying to load it:\\n\\n```\\nimport os\\nimport pandas as pd\\n\\nfile_path = \"\"\\n\\nif os.path.isfile(file_path):\\n df = pd.read_csv(file_path)\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nelse:\\n print(\"The file does not exist\")\\n```\\n\\nThis code will check if the file exists before trying to load it, and will print a message if the file does not exist.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmp5zsm1ywy/RKBk", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": 
"Al1zinflation.csv\" does not exist. This could be due to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a number of reasons such as the file being deleted, the path being incorrect", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", or the file not being accessible.\n\nTo resolve this issue, you should", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " ensure that the file exists and the path is correct. If the file does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not exist, you will need to create it or obtain it from the relevant", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " source. If the path is incorrect, you will need to update the path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to the correct location of the file.\n\nAdditionally, you can use the `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "os` module to check if the file exists before trying to load it:\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "```\nimport os\nimport pandas as pd\nimport", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " matplotlib.pyplot as plt\n\nfile_path = \"/var/folders/rb/q", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "v8vwgyj6yjd3t4pwsy9", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "t0rm0000gn/T/tmp5zsm1ywy/R", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "KBkAl1zinflation.csv\"\n\nif os.path.isfile(file_path):\n", - "type": "text" - }, - 
"event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " df = pd.read_csv(file_path)\n df['", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Year'] = pd.to_datetime(df['Year'], format='%Y')\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " df_avg_inflation = df.groupby('Year')['Inflation'].mean().", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "reset_index()\n plt.figure(figsize=(10,6))\n plt.plot", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(df_avg_inflation['Year'], df_avg_inflation['Inflation'],", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " marker='o')\n plt.title('Average Yearly Inflation')\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " plt.xlabel('Year')\n plt.ylabel('Inflation')\n plt.grid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(True)\n plt.show()\nelse:\n print(\"The file does not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " exist\")\n```\n\nThis code will check if the file exists before trying to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " load it, and will print a message if the file does not exist.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), 
CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, you can use the `os` module to check if the file exists before trying to load it:\\n\\n```\\nimport os\\nimport pandas as pd\\n\\nfile_path = \"\"\\n\\nif os.path.isfile(file_path):\\n df = pd.read_csv(file_path)\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nelse:\\n print(\"The file does not exist\")\\n```\\n\\nThis code will check if the file exists before trying to load it, and will print a message if the file does not exist.', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp5zsm1ywy/RKB", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "kAl1zinflation.csv\")\n\n# Convert the 'Year'", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "'], format='%Y')\n\n# Group by", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "# Plot the average inflation as a time series\nplt.figure(figsize=(10", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ",6))\nplt.plot(df_avg_inflation['Year'], df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": 
"flation['Inflation'], marker='o')\nplt.title('Average Yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp5zsm1ywy/RKBkAl1zinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + "metric": "completion_tokens", + "span_id": "1A0bWgLL", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:24.102404+00:00", + "__module__": "datetime" }, - "call_id": "61b988d6-45f4-4147-8b62-69c3abbb03a9", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "trace_id": "4a5HMcM9R3uWB4Cv", + "type": "metric", + "unit": "tokens", + "value": 10 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 
'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, you can use the `os` module to check if the file exists before trying to read it. Here is an example:\\n\\n```\\nimport os\\nimport pandas as pd\\n\\nfile_path = \"\"\\n\\nif os.path.isfile(file_path):\\n df = pd.read_csv(file_path)\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nelse:\\n print(\"The file does not exist\")\\n```\\n\\nThis code will check if the file exists before trying to read it. If the file does not exist, it will print \"The file does not exist\".', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - 
}, - { - "event": { - "delta": { - "text": "vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "jd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmp1ugde3u9/FSj", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "wY288inflation.csv\" does not exist. This could be due", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to a number of reasons such as the file being deleted, the path being", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " incorrect, or the file not being accessible.\n\nTo resolve this issue, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " should ensure that the file exists and the path is correct. If the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " does not exist, you will need to create it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " or obtain it from the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " relevant source. 
If the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " path is incorrect, you will need to update the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " path to the correct location of the file.\n\nAdditionally", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", you can use the `os` module to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " check if the file exists before trying to read it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Here is an example:\n\n```\nimport os", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "file_path = \"/var/folders/rb/qv8vwgyj", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "6yjd3t4pwsy9t0rm0000", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "gn/T/tmp1ugde3u9/FSjwY288", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "inflation.csv\"\n\nif os.path.isfile(file_path):\n df = pd", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".read_csv(file_path)\n df['Year'] = pd.to_datetime(df", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "['Year'], format='%Y')\n df_avg", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_inflation = df.groupby('Year')['Inflation", - 
"type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'].mean().reset_index()\n plt.figure(figsize=(10,6))\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " plt.plot(df_avg_inflation['Year'], df_avg_inflation['", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Inflation'], marker='o')\n plt.title('Average Yearly In", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "flation')\n plt.xlabel('Year')\n plt", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".ylabel('Inflation')\n plt.grid(True)\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " plt.show()\nelse:\n print(\"The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file does not exist\")\n```\n\nThis code will", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " check if the file exists before trying to read it. 
If the file does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not exist, it will print \"The file does not exist\".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\"), CompletionMessage(role='assistant', content='The error message indicates that the file \"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, you can use the `os` module to check if the file exists before trying to read it. Here is an example:\\n\\n```\\nimport os\\nimport pandas as pd\\n\\nfile_path = \"\"\\n\\nif os.path.isfile(file_path):\\n df = pd.read_csv(file_path)\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nelse:\\n print(\"The file does not exist\")\\n```\\n\\nThis code will check if the file exists before trying to read it. 
If the file does not exist, it will print \"The file does not exist\".', stop_reason=, tool_calls=[]), UserMessage(role='user', content='Plot average yearly inflation as a time series', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp1ugde3u9/FS", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "jwY288inflation.csv\")\n\n# Convert the 'Year' column", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " to datetime\ndf['Year'] = pd.to_datetime(df['Year'],", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - 
"parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " format='%Y')\n\n# Group by", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_inflation['Inflation'], marker='o')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp1ugde3u9/FSjwY288inflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = 
df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "da5760dd-614a-4c19-954c-b4e354e75d79", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { 
- "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmpbb210725/duWDtjG", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ninflation.csv\" does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " due to a number of reasons such as the file being deleted,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the path being", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " incorrect, or the file being moved to a different location.\n\nTo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " resolve this issue, you should ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file exists and the path is correct. If the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file does exist, you can try to load", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " it using the correct path. 
If the file does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not exist, you will need to create it or obtain", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " it from the relevant", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " source.\n\nHere is an example of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " how you can modify the code to handle this situation:\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "```\nimport pandas as pd\n\n# Define the path to the CSV file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\nfile_path = \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmpbb210725/duWDtjG", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ninflation.csv\"\n\n# Check if the file exists\nimport os", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\nif os.path.isfile(file_path):\n # Load", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the CSV file\n df = pd.read_csv(file_path)\n\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " # Print the first few rows of the dataframe\n print(df.head())\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " # Print information about", - 
"type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the dataframe\n print(df.info())\n\n # Print summary statistics about the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " dataframe\n print(df.describe())\nelse:\n print(\"The file does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not exist.\")\n```\n\nThis code will check if the file exists before", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " attempting to load it. If the file does not exist, it will print", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a message indicating that the file does not exist.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - 
"delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "rm0000gn/T/tmp5zsm1ywy/RKBk", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Al1zinflation.csv\" does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " due to a number of reasons such as the file being deleted, the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " path being incorrect, or the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not being accessible.\n\nTo resolve this issue, you should ensure", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " that the file exists and the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " path is correct. 
If the file does not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " exist, you will need to create it or obtain it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " from the relevant", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " source. If the path is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " incorrect, you will need to update the path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " to the correct location of the file.\n\nAdditionally,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you can use the `os` module to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " check if the file exists before trying to load it:\n\n``", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "`\nimport os\nimport pandas as pd\n\nfile_path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " = \"/var/folders", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/rb/qv8vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "/tmp5zsm1ywy/RKBkAl1zinflation.csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\"\n\nif os.path.isfile(file_path):\n df =", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - 
"value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " pd.read_csv(file_path)\n print(df.head())\n print", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(df.info())\n print(df.describe())\nelse:\n print(\"The file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " does not exist\")\n```\n\nThis code will check if the file exists before", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " trying to load it, and will print a message if", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the file does not exist.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')]), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": 
null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "vwgyj6yjd3t4p", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "wsy9t0rm0000gn/T/tmpdcpkc9", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_f/FKWQnYoVinflation.csv\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " due to a number of reasons such as the file being deleted, the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " path being incorrect, or the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " being in a different location.\n\nTo resolve this issue, you can try", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the following:\n\n1. Check the file path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ": Make sure the file path is correct and the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file exists in the specified location.\n2. 
Use a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " relative path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ": If the file is in the same directory as", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " your Python script, you can use", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a relative path instead of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " an absolute path.\n3. Check file permissions", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ": Make sure you have the necessary permissions to read the file.\n4.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Use a try-except block: You can use a try-except", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " block to catch the FileNotFoundError and handle it accordingly.\n\nHere is an example of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " how you can modify the code to handle the FileNotFoundError:\n\n```\nimport pandas", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " as pd\n\ntry:\n df =", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "6yjd3t", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "4pwsy9t0rm0000", - "type": "text" - }, - 
"event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "gn/T/tmpdcpkc9_f/FKWQnYoVinflation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".csv\")\n print(df.head())\n print(df.info())\n print(df", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".describe())\nexcept FileNotFoundError:\n print(\"The file does not exist\")\n``", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "`\n\nThis code will print \"The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " file does not exist\" if the file is not found, instead", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " of raising an error.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Here is a csv, can you describe it?', context=None), ToolResponseMessage(role='tool', call_id='', tool_name=, content=[TextContentItem(type='text', text='# User provided a file accessible to you at \"\"\\nYou can use code_interpreter to load and inspect it.')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" 
- }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "/rb/qv8vwgyj6yjd3t4pwsy9t0", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "rm0000gn/T/tmpbb210725/duWDtjGninflation.csv\")\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Print the first few rows of the dataframe\nprint(df.head())\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " Print information about", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " the dataframe\nprint(df.info())\n\n# Print summary statistics about the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " dataframe\nprint(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpbb210725/duWDtjGninflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" + "metric": "total_tokens", + "span_id": "1A0bWgLL", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:24.102411+00:00", + "__module__": "datetime" }, - "call_id": "3ab348fd-a9b8-47d7-be10-7d38159c9a0d", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } - }, - "type": 
"tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "trace_id": "4a5HMcM9R3uWB4Cv", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } } ], "type": "generator" }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:3e3a0\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:fd0f6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help. What's your first question about Torchtune?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'using LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:7da0c\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:7da0c\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:7da0c\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}": { "chunks": [ { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " these steps:\n\n1. Import the necessary modules: `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "from torchtune.models.llama2 import llama2_7b", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", lora_llama2_7b`\n2. 
Create a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 model with LoRA: `lora", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_model = lora_llama2_7b(lora_attn_modules", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "=[\"q_proj\", \"v_proj\"])`\n3. Load the pre-trained", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 weights into the LoRA", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " model: `lora_model.load_state_dict(base_model.state_dict(), strict", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "=False)`\n4. Set only LoRA parameters to trainable:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " `from torchtune.modules.peft.peft_utils import get", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_adapter_params, set_trainable_params`\n5. 
Run the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " LoRA finetune using torchtune's Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA recipe: `tune run --", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nnodes 1 --nproc_per_node 2 lora_finet", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "une_distributed --config llama2/7B_lora`\n\nYou can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " also experiment with different LoRA configurations, such as applying Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA to all linear layers in the self-attention, increasing", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the rank, and scaling alpha and rank together.\n\nNote: You", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " need to have the Llama2 weights and tokenizer downloaded", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and installed, and you need to have the necessary dependencies", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " installed, including torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and PyTorch.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - 
"('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:3e3a0\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:fd0f6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help. What's your first question about Torchtune?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" }, - "call_id": 
"62b19206-ed9f-42d1-a614-1582d8598193", - "tool_name": "knowledge_search" + "tool_call": "", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:3e3a0\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:fd0f6\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m ready to help. 
What", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'s your first question about Torchtune?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f76dc\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:de2d4\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help. What's your first question about Torchtune?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'using LoRA in Torchtune'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text=\"Result 1:\\nDocument_id:c4fc3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet\\'s inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer\\'s self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:c4fc3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 4:\\nDocument_id:c4fc3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\"\"\"\\n {total_params} total params,\\n {trainable_params}\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \"\"\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe , tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " these steps:\n\n1. Import the necessary modules: `", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "from torch", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "tune.models.llama2 import llama2_7b", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ", lora_llama2_7b`\n2. Create a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 model with LoRA: `lora", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_model = lora_ll", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "ama2_7b", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "(lora_attn_modules=[\"q_proj\", \"v_proj\"])`\n3", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ". Load the pre-trained Llama2 weights into", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the LoRA model: `lora_model.load_state", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_dict(base_model.state_dict(), strict=False)`\n4. 
Set only Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA parameters to trainable: `from torchtune.modules.peft", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".peft_utils import get_adapter_params,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " set_trainable_params`\n5. Run the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " LoRA finetune using torchtune's Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA recipe: `tune run --", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nnodes 1 --nproc_per_node 2 lora_finet", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "une_distributed --config llama2/7B_lora`\n\nYou can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " also experiment with different LoRA configurations, such as applying Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "RA to all linear layers in the self-attention, increasing the rank,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " and scaling alpha and rank together.\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Note: You need to have the pre-trained", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Llama2 weights and tokenizer downloaded and installed before running the LoRA fin", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - 
"delta": { - "text": "etune. Additionally, you can use torch", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "tune's `WandBLogger` to generate", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " loss curves and track the experiment's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " progress.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f76dc\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. 
code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:de2d4\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')]), CompletionMessage(role='assistant', content=\"I'm ready to help. What's your first question about Torchtune?\", stop_reason=, tool_calls=[]), UserMessage(role='user', content='Tell me how to use LoRA', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "8413a252-8372-4061-a4a1-0a1d165dd373", - "tool_name": "knowledge_search" + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point_with", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Torchtune documentation'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:f76dc\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\"json\",\\n data_files=\"data/my_data.json\",\\n split=\"train\",\\n conversation_column=\"dialogue\",\\n conversation_style=\"sharegpt\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune\\'s factory settings, but we may want to experiment a bit.\\nLet\\'s take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: [\\'q_proj\\', \\'v_proj\\']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:de2d4\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'m ready to help. What's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " your first question about Torchtune?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='I am attaching some documentation for Torchtune. Help me answer questions I will ask next.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"Torchtune documentation\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "f21015ed-e70b-4a2b-a038-9335acbe0c53", - "tool_name": "knowledge_search" + "tool_call": "_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lama3-8B uses grouped-query", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " attention instead of the standard multi-head attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. 
You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lama3-8B uses grouped-query attention instead of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the standard multi-head attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lama3-8B uses grouped-query", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " attention instead of the standard multi-head attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Llama3-8B attention type'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n'), TextContentItem(type='text', text=\"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. 
You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\"), TextContentItem(type='text', text=\"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\"), TextContentItem(type='text', text='Result 4:\\nDocument_id:num-0\\nContent: \\'m Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let\\'s walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet\\'s say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \"role\": \"system\",\\n \"\\n'), TextContentItem(type='text', text='Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet\\'s take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\\n\\n.. 
note::\\n\\n Calling :func:`lora_llama_2\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "lama3-8B uses grouped-query attention instead of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the standard", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " multi-head attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='insert_into_memory', description='Insert documents into memory', parameters={}), ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"Llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "3-8B attention type\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "call_id": "bf3bf9f9-0e56-4720-a6a9-be8ad9e8dfcb", - "tool_name": "knowledge_search" + "tool_call": "celcius\": \"true\"}}", + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Instead of the standard multi-head attention, what attention type does Llama3-8B use?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"Llama", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "3-8B attention type\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" }, - "call_id": "9c9a922f-afd6-4bc8-83ba-28211bb3fd29", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel 
Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\"Challah Horse\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. 
### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'query': 'current CEO of Meta'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content='{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief 
Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company\\'s position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\"Challah Horse\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. 
### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Search the web and tell me who the current CEO of Meta is.', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - 
"__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "brave_search.call(query=\"current CEO of Meta\")", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "current CEO of Meta" + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "ee5ac18d-de3b-4985-9e93-545de166d3e2", + "tool_name": "get_boiling_point_with_metadata" }, - "call_id": "2039dce8-afbe-4517-bb4a-43c92dab8cff", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "brave_search" - } + "type": "tool_call" }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': True})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice='get_boiling_point', tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": 
null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " -100\u00b0C.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': True})]), ToolResponseMessage(role='tool', call_id='', tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)}), ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='get_boiling_point', arguments={'liquid_name': 'polyjuice', 'celcius': True})]), ToolResponseMessage(role='tool', call_id='', 
tool_name='get_boiling_point', content='-100')])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " provided function \"get_boiling_point\" is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not sufficient to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " answer the question as it does not contain information", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " about the boiling point of \"poly", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "juice\". Polyjuice is not a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " real liquid and does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " not have a known boiling point. 
If you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " have any other questions or need", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " information about a different liquid,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " I would be happy to try and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " assist you.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice='get_boiling_point', tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "get", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_boiling_point(liquid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "_name='polyjuice', celcius=True)]", - "type": 
"text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "302993c2-3c56-48cf-8891-afac1f20723e", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)}), ToolDefinition(tool_name=, description='Search the web for information', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": 
null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "get_boiling_point(liquid_name", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "='polyjuice', celcius=True)]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "metric": "prompt_tokens", + "span_id": "dsGyjpUB", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:45.316534+00:00", + "__module__": "datetime" }, - "call_id": "9544e61b-5e69-427b-b30c-874fdbcf53f7", - "tool_name": "get_boiling_point" + "trace_id": "BO0etAZ6RFmGmLCW", + "type": "metric", + "unit": "tokens", + "value": 37 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Poly", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "juice is a fictional potion from", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the Harry Potter series by J.K. Rowling. 
As it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'s not a real substance, it doesn", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "'t have a boiling point. Polyjuice Potion is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a magical concoction that allows the drinker to assume the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " form and appearance of another person, but it's not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " a physical substance that can be measured or analyzed in the same", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " way as real-world chemicals.\n\nIf", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " you have any other questions or if there's anything else I can help you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " with, feel free to ask!", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='What is the boiling point of polyjuice?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='get_boiling_point', description='Returns the boiling point of a liquid in Celcius or Fahrenheit', parameters={'liquid_name': ToolParamDefinition(param_type='string', description='The name of the liquid', required=True, default=None), 'celcius': 
ToolParamDefinition(param_type='bool', description='Whether to return the boiling point in Celcius', required=False, default=True)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "', celcius=True)]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "ce595f0c-86f3-4055-b675-09e00007dc97", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Write code and execute it to find the answer for: What is the 100th prime number?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=, system_message_behavior=)), ('tool_prompt_format', ), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', 
required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " 100th prime number is 541", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Write code and execute it to find the answer for: What is the 100th prime number?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name=, arguments={'code': 'def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))'})]), ToolResponseMessage(role='tool', call_id='', tool_name=, content=\"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\")])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " 100th prime number is 541", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": 
null - }, - { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='Write code and execute it to find the answer for: What is the 100th prime number?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "\n if n <=", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "3:\n return True\n if n % 2 == 0", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " or n % 3 == 0:\n return False\n i", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - 
"value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " = 5\n while i * i <=", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " n:\n if n % i == 0 or n % (i", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " + 2) == 0:\n return False\n i +=", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " 0\n num = 2\n while True:\n if", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": " is_prime(num):\n count += 1\n if count == n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "in_progress" - }, - "tool_call": "(100))", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" + "metric": "completion_tokens", + "span_id": 
"dsGyjpUB", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:45.316569+00:00", + "__module__": "datetime" }, - "call_id": "63d06ce7-5266-4ee8-a620-0e81cf5108a1", - "tool_name": { - "__enum__": "BuiltinTool", - "value": "code_interpreter" - } + "trace_id": "BO0etAZ6RFmGmLCW", + "type": "metric", + "unit": "tokens", + "value": 10 }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'Perplexity the company founding date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "Per", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "plexity the company was founded in 202", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "2.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was Perplexity the company founded?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. 
Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"Perplexity the company", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " founding date\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity the company founding date" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" }, - "call_id": "3804eaba-07f8-448c-8dd4-8ee14d748a05", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None), CompletionMessage(role='assistant', content='', stop_reason=, tool_calls=[ToolCall(call_id='', tool_name='knowledge_search', arguments={'query': 'NBA creation date'})]), ToolResponseMessage(role='tool', call_id='', tool_name='knowledge_search', content=[TextContentItem(type='text', text='knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n'), TextContentItem(type='text', text='Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n'), TextContentItem(type='text', text='Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on 
Wall Street.[5]\\n'), TextContentItem(type='text', text='Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n'), TextContentItem(type='text', text='END of knowledge_search tool results.\\n')])])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " NBA was created on August 3, 1949, with", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " the merger of the Basketball", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": " Association of America (BAA) and the National Basketball League (NBL", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": ").", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null - } - ], - "type": "generator" - }, - "('meta-llama/Llama-3.3-70B-Instruct', [SystemMessage(role='system', content='You are a helpful assistant'), UserMessage(role='user', content='when was the nba created?', context=None)])_[('response_format', None), ('sampling_params', SamplingParams(strategy=TopPSamplingStrategy(type='top_p', temperature=0.0001, top_p=0.9), max_tokens=0, repetition_penalty=1.0)), ('stream', True), ('tool_config', ToolConfig(tool_choice=, tool_prompt_format=None, system_message_behavior=)), ('tool_prompt_format', None), ('tools', [ToolDefinition(tool_name='knowledge_search', description='Search for information in a database.', 
parameters={'query': ToolParamDefinition(param_type='string', description='The query to search for. Can be a natural language sentence or keywords.', required=True, default=None)}), ToolDefinition(tool_name=, description='Execute code', parameters={'code': ToolParamDefinition(param_type='string', description='The code to execute', required=True, default=None)})])]": { - "chunks": [ - { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "text": "nowledge_search(query=\"NBA creation date\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - }, - { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "NBA creation date" + "metric": "total_tokens", + "span_id": "dsGyjpUB", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:45.316576+00:00", + "__module__": "datetime" }, - "call_id": "d94006c1-5692-4ada-8f1a-d09ef2d46dab", - "tool_name": "knowledge_search" + "trace_id": "BO0etAZ6RFmGmLCW", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - "type": "tool_call" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + 
"metrics": null + } }, { - "event": { - "delta": { - "text": "", - "type": "text" + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "value": "complete" + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " customer smiled and said \"hello\" to the friendly store clerk.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "value": "end_of_turn" - } - }, - "metrics": null + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "R9a1QHt4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:19.586300+00:00", + "__module__": "datetime" + }, + "trace_id": "t-ZRvSMzTCudL6SB", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "R9a1QHt4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:19.586359+00:00", + "__module__": "datetime" + }, + "trace_id": "t-ZRvSMzTCudL6SB", + "type": "metric", + "unit": "tokens", + "value": 24 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "R9a1QHt4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:19.586367+00:00", + "__module__": "datetime" + }, + "trace_id": "t-ZRvSMzTCudL6SB", + "type": "metric", + "unit": "tokens", + "value": 54 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load 
and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the `bwrap.core", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "` module is not found. 
This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is because the `bwrap` module is not installed in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " your Python environment.\n\nTo fix this issue,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you can use the `knowledge_search` function to describe the CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file. 
This function can be used to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " search for information in a database, and it might have", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " access to information about the CSV file.\n\nHere is an example of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " how you can use the `knowledge_search` function to describe the CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file:\n\n```\n{\n \"type\": \"function\",\n \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "name\": \"knowledge_search\",\n \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " {\n \"query\": \"Describe the CSV file at /var", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/folders/cz/vyh7y1", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "d11xg881lsxsshnc5c0000gn/T", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/tmpvto5j2dr/u8MQ2jywin", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "flation.csv\"\n }\n}\n```", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "9UjZne1U", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:15.341367+00:00", + "__module__": "datetime" + }, + "trace_id": "cOvUfJZLSK2vci9f", + "type": "metric", + "unit": "tokens", + "value": 149 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "9UjZne1U", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:15.341380+00:00", + "__module__": "datetime" + }, + "trace_id": "cOvUfJZLSK2vci9f", + "type": "metric", + "unit": "tokens", + "value": 188 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "9UjZne1U", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:15.341383+00:00", + "__module__": "datetime" + }, + "trace_id": "cOvUfJZLSK2vci9f", + "type": "metric", + "unit": "tokens", + "value": 337 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": 
\"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\ndf = pd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".read_csv(\"/var/folders/cz/vyh7y1", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "d11xg881lsxsshnc5c0000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmpvto5j2", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "dr/u8MQ2jywinflation.csv\")\nprint(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".head())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpvto5j2dr/u8MQ2jywinflation.csv\")\nprint(df.head())" + }, + "call_id": "ecc9db21-332f-4931-8820-cf139f8a0b88", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": 
"llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "6VEDipbd", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:14.030541+00:00", + "__module__": "datetime" + }, + "trace_id": "cOvUfJZLSK2vci9f", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "6VEDipbd", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:14.030577+00:00", + "__module__": "datetime" + }, + "trace_id": "cOvUfJZLSK2vci9f", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "6VEDipbd", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:14.030584+00:00", + "__module__": "datetime" + }, + "trace_id": "cOvUfJZLSK2vci9f", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", 
\"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over time", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". The x-axis represents the year and the y-axis represents the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " average inflation. 
Each point on the plot represents the average inflation for", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a particular year.\n\nPlease note that you need to replace 'in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "flation.csv' with the actual path to your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file. Also, this code assumes that the csv file has a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " column named 'date' and another column named 'inflation'. 
If your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file has different column names, you need to replace 'date' and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'inflation' with the actual column names.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Hm1BkrMQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:32:41.982115+00:00", + "__module__": "datetime" + }, + "trace_id": "T857cf9QSamVBOAy", + "type": "metric", + "unit": "tokens", + "value": 636 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Hm1BkrMQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:32:41.982147+00:00", + "__module__": "datetime" + }, + "trace_id": "T857cf9QSamVBOAy", + "type": "metric", + "unit": "tokens", + "value": 126 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Hm1BkrMQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:32:41.982153+00:00", + "__module__": "datetime" + }, + "trace_id": "T857cf9QSamVBOAy", + "type": "metric", + "unit": "tokens", + "value": 762 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": 
\"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": 
\"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data\ndf = pd.read_csv('inflation.csv')\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Convert date column to datetime\ndf['date'] = pd.to", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_datetime(df['date'])\n\n# Group by year and calculate average inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": 
null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\naverage_inflation = df.groupby(df['date'].dt.year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ")['inflation'].mean()\n\n# Plot the time series\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".figure(figsize=(10,6))\nplt.plot(average_inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".index, average_inflation.values, marker='o')\nplt.title", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Average Yearly Inflation')\nplt.xlabel('Year')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".ylabel('Average Inflation')\nplt.grid(True)\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "4849f8b5-bbb8-4c7e-8f19-498dd559dbe2", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "ZKjmS7HQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:32:30.999750+00:00", + "__module__": "datetime" + }, + "trace_id": "T857cf9QSamVBOAy", + "type": "metric", + "unit": "tokens", + "value": 450 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "ZKjmS7HQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:32:30.999780+00:00", + "__module__": "datetime" + }, + "trace_id": "T857cf9QSamVBOAy", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": 
"meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "ZKjmS7HQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:32:30.999786+00:00", + "__module__": "datetime" + }, + "trace_id": "T857cf9QSamVBOAy", + "type": "metric", + "unit": "tokens", + "value": 460 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server or a local machine, you can use the `pd.read_csv()` function to load the csv file. 
\\n\\nHere is an example:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column dtypes, non-nullable counts, and memory usage), and the descriptive statistics of the dataframe.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": 
\"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over time.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " The x-axis represents the year and the y-axis represents the average inflation.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " The plot also includes a title, labels for the x and y axes,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and a grid for better visibility.\n\nPlease note that you need to replace '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "inflation.csv' with the actual path to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " your csv file. 
Also, this code", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " assumes that the 'date' column in your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file is in a format that can be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " parsed by pandas' `to_datetime` function. If your date", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " column is in a different format, you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " may need to specify the format using the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " `format` parameter of `to_datetime`.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": 
"meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Yv7iXXNJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:50.214420+00:00", + "__module__": "datetime" + }, + "trace_id": "srzTfsP6Sr-co-Ll", + "type": "metric", + "unit": "tokens", + "value": 621 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Yv7iXXNJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:50.214481+00:00", + "__module__": "datetime" + }, + "trace_id": "srzTfsP6Sr-co-Ll", + "type": "metric", + "unit": "tokens", + "value": 143 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Yv7iXXNJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:50.214490+00:00", + "__module__": "datetime" + }, + "trace_id": "srzTfsP6Sr-co-Ll", + "type": "metric", + "unit": "tokens", + "value": 764 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a remote server or a local machine, you can use the `pd.read_csv()` function to load the csv file. 
\\n\\nHere is an example:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column dtypes, non-nullable counts, and memory usage), and the descriptive statistics of the dataframe.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf = pd.read_csv('inflation.csv')\n\n# Convert", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'date' column to datetime\ndf['date'] = pd.to", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_inflation = df.groupby(df['date'].dt.year)['inflation'].", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + 
"value": "in_progress" + }, + "tool_call": "))\nplt.plot(average_inflation.index, average_inflation.values, marker", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "62e5a10d-8a59-41e7-9f0e-87cabc7d15fa", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": 
"llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "dv6g9n2H", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:48.391101+00:00", + "__module__": "datetime" + }, + "trace_id": "srzTfsP6Sr-co-Ll", + "type": "metric", + "unit": "tokens", + "value": 433 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "dv6g9n2H", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:48.391113+00:00", + "__module__": "datetime" + }, + "trace_id": "srzTfsP6Sr-co-Ll", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "dv6g9n2H", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:48.391116+00:00", + "__module__": "datetime" + }, + "trace_id": "srzTfsP6Sr-co-Ll", + "type": "metric", + "unit": "tokens", + "value": 443 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "It", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " seems that the file \"/var/folders/cz/vyh7", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "y1d11xg881lsxsshnc5c", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": 
"0000gn/T/tmpvto5j", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "2dr/JwKzVg", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "5Ainflation.csv\" does not exist. \n\nTo describe the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file, you need to provide the actual file path or the file itself", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". If you are using a remote server or a local machine, you can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " use the `pd.read_csv()` function to load", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the csv file. 
\n\nHere is an example:\n\n```", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "python\nimport pandas as pd\n# Load data\ndf =", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " pd.read_csv('inflation.csv')\n# Print the first 5 rows", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of the dataframe\nprint(df.head())\n# Print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the summary of the dataframe\nprint(df.info())\nprint(df.describe", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "())\n```\n\nThis will print the first 5 rows of the dataframe", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", the summary of the dataframe", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " (including the index dtype and column dtypes, non-nullable", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " counts, and memory usage), and the descriptive statistics of the dataframe.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qV1E8nPK", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:41.439164+00:00", + "__module__": "datetime" + }, + "trace_id": "GG3oeA3qRH6WIf6Z", + "type": "metric", + "unit": "tokens", + "value": 215 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qV1E8nPK", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:41.439188+00:00", + "__module__": "datetime" + }, + "trace_id": "GG3oeA3qRH6WIf6Z", + "type": "metric", + "unit": "tokens", + "value": 216 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qV1E8nPK", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:41.439190+00:00", + "__module__": "datetime" + }, + "trace_id": "GG3oeA3qRH6WIf6Z", + "type": "metric", + "unit": "tokens", + "value": 431 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh7y1d11", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "xg881lsxsshnc5c0000gn/T/tmp", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + 
"__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "vto5j2dr/JwKzVg", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "5Ainflation.csv\")\n# Rows\nprint(\"Number", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of rows and columns in the data:\", df.shape)\n# Columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint(\"Columns of the data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", len(df.columns))\n# Column names\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Columns of the data are:\", df.columns)\n# Column dt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null 
+ }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "ypes\nprint(\"Datatype of the columns are:\", df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".dtypes)", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpvto5j2dr/JwKzVg5Ainflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + }, + "call_id": "87c3ef49-27e0-4561-ade3-83569a0fe236", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "9OTP08Yr", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:39.830624+00:00", + "__module__": "datetime" + }, + "trace_id": "GG3oeA3qRH6WIf6Z", + "type": "metric", + "unit": "tokens", + "value": 36 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": 
"completion_tokens", + "span_id": "9OTP08Yr", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:39.830656+00:00", + "__module__": "datetime" + }, + "trace_id": "GG3oeA3qRH6WIf6Z", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "9OTP08Yr", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:39.830662+00:00", + "__module__": "datetime" + }, + "trace_id": "GG3oeA3qRH6WIf6Z", + "type": "metric", + "unit": "tokens", + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:61fc5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:af027\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d5787\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:af027\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:af027\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:af027\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:61fc5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:af027\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d5787\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"query\": \"How", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "14b82c7e-18d4-4b46-8f07-442be700e8ae", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "DBZOtUux", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:58.136315+00:00", + "__module__": "datetime" + }, + "trace_id": "XVSIgZRXR_aHBiAN", + "type": "metric", + "unit": "tokens", + "value": 117 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "DBZOtUux", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:58.136380+00:00", + "__module__": "datetime" + }, + "trace_id": "XVSIgZRXR_aHBiAN", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "DBZOtUux", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:58.136387+00:00", + "__module__": "datetime" + }, + "trace_id": "XVSIgZRXR_aHBiAN", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:61fc5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:af027\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d5787\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " documentation you provided. What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "gFK_4CQi", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:56.169962+00:00", + "__module__": "datetime" + }, + "trace_id": "A2oXFF9fRz2-Lc9N", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "gFK_4CQi", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:56.169995+00:00", + "__module__": "datetime" + }, + "trace_id": "A2oXFF9fRz2-Lc9N", + "type": "metric", + "unit": "tokens", + "value": 35 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "gFK_4CQi", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:56.170001+00:00", + "__module__": "datetime" + }, + "trace_id": "A2oXFF9fRz2-Lc9N", + "type": "metric", + 
"unit": "tokens", + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:78970\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:8404f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:cbeb1\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8404f\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:8404f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:8404f\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:78970\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:8404f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:cbeb1\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " {\"query\": \"How to use LoRA in Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "dc7dd9e0-6ca1-452e-bb62-532a09e71848", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "1iT28abM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:33:53.948952+00:00", + "__module__": "datetime" + }, + "trace_id": "gd_zuJXnSaSfS3ZK", + "type": "metric", + "unit": "tokens", + "value": 117 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "1iT28abM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:33:53.949001+00:00", + "__module__": "datetime" + }, + "trace_id": "gd_zuJXnSaSfS3ZK", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "1iT28abM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:33:53.949013+00:00", + "__module__": "datetime" + }, + "trace_id": "gd_zuJXnSaSfS3ZK", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:78970\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:8404f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:cbeb1\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the documentation you provided. What's your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "F3R1-xJM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:33:52.280696+00:00", + "__module__": "datetime" + }, + "trace_id": "7Do839YJRHC_ADjC", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "F3R1-xJM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:33:52.280743+00:00", + "__module__": "datetime" + }, + "trace_id": 
"7Do839YJRHC_ADjC", + "type": "metric", + "unit": "tokens", + "value": 35 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "F3R1-xJM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:33:52.280778+00:00", + "__module__": "datetime" + }, + "trace_id": "7Do839YJRHC_ADjC", + "type": "metric", + "unit": "tokens", + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:78a41\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:7b4a7\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:531f2\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7b4a7\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:7b4a7\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:7b4a7\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:78a41\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:7b4a7\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:531f2\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\", \"parameters\": {\"query\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "How to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "721ea24f-be72-45fc-892c-aa7843f21ddf", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "VxsqbWot", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:42.471323+00:00", + "__module__": "datetime" + }, + "trace_id": "c_UJ92LEQciFQx3T", + "type": "metric", + "unit": "tokens", + "value": 117 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "VxsqbWot", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:42.471354+00:00", + "__module__": "datetime" + }, + "trace_id": "c_UJ92LEQciFQx3T", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "VxsqbWot", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:42.471364+00:00", + "__module__": "datetime" + }, + "trace_id": "c_UJ92LEQciFQx3T", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:78a41\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:7b4a7\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:531f2\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided. What's your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "V87G94tT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:40.786211+00:00", + "__module__": "datetime" + }, + "trace_id": "zdMkkXSDT0mK4qaK", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "V87G94tT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:40.786377+00:00", + "__module__": "datetime" + }, + "trace_id": 
"zdMkkXSDT0mK4qaK", + "type": "metric", + "unit": "tokens", + "value": 35 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "V87G94tT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:40.786394+00:00", + "__module__": "datetime" + }, + "trace_id": "zdMkkXSDT0mK4qaK", + "type": "metric", + "unit": "tokens", + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d341f\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:900f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:49640\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:900f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:900f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:900f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d341f\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:900f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:49640\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"How to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "38c8de4c-95b1-44b6-a685-c153631305d1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "t7U94vaX", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:34:07.491116+00:00", + "__module__": "datetime" + }, + "trace_id": "fM03LVqrT7ufMvUA", + "type": "metric", + "unit": "tokens", + "value": 117 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "t7U94vaX", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:34:07.491187+00:00", + "__module__": "datetime" + }, + "trace_id": "fM03LVqrT7ufMvUA", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "t7U94vaX", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:34:07.491195+00:00", + "__module__": "datetime" + }, + "trace_id": "fM03LVqrT7ufMvUA", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d341f\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:900f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:49640\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " based on the documentation you provided. What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "8iPkD4Fz", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:34:05.798649+00:00", + "__module__": "datetime" + }, + "trace_id": "JlE9DKp_RnCewBUu", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "8iPkD4Fz", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:34:05.798743+00:00", + "__module__": "datetime" + }, + "trace_id": "JlE9DKp_RnCewBUu", + "type": "metric", + "unit": "tokens", + "value": 35 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "8iPkD4Fz", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:34:05.798759+00:00", + "__module__": "datetime" + }, + "trace_id": "JlE9DKp_RnCewBUu", + "type": "metric", + 
"unit": "tokens", + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": {\"query\": \"Torchtune documentation\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "b92c0200-4acb-4b6f-8ec7-2e2f993d6e1a", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "eANTdkZu", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:45.683600+00:00", + "__module__": "datetime" + }, + "trace_id": "A2oXFF9fRz2-Lc9N", + "type": "metric", + "unit": "tokens", + "value": 39 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "eANTdkZu", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:45.683632+00:00", + "__module__": "datetime" + }, + "trace_id": "A2oXFF9fRz2-Lc9N", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "eANTdkZu", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:45.683639+00:00", + "__module__": "datetime" + }, + "trace_id": "A2oXFF9fRz2-Lc9N", + "type": "metric", + "unit": "tokens", + "value": 49 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", 
\"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. 
note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "B is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "l8TIu3wW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:37.955798+00:00", + "__module__": "datetime" + }, + "trace_id": "rOU-VODXQUuIR6_p", + "type": "metric", + "unit": "tokens", + "value": 80 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "l8TIu3wW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:37.955879+00:00", + "__module__": "datetime" + }, + "trace_id": "rOU-VODXQUuIR6_p", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "l8TIu3wW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:37.955889+00:00", + "__module__": "datetime" + }, + "trace_id": "rOU-VODXQUuIR6_p", + "type": "metric", + "unit": "tokens", + "value": 106 + } + ] + } + } + ], + 
"type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. 
code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. 
note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8B is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Ihnuyt_Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:24.902478+00:00", + "__module__": "datetime" + }, + "trace_id": "6eJM3WR0QsyIiMfg", + "type": "metric", 
+ "unit": "tokens", + "value": 80 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Ihnuyt_Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:24.902491+00:00", + "__module__": "datetime" + }, + "trace_id": "6eJM3WR0QsyIiMfg", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Ihnuyt_Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:24.902493+00:00", + "__module__": "datetime" + }, + "trace_id": "6eJM3WR0QsyIiMfg", + "type": "metric", + "unit": "tokens", + "value": 106 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"type\": \"function\",\n \"name\": \"knowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\",\n \"parameters\": {\n \"query\": \"Llama3-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8B attention type\"\n }\n}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "0af9e857-510d-4df8-872f-51b520578c22", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + 
"__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "b4C_3cNl", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:27.116730+00:00", + "__module__": "datetime" + }, + "trace_id": "rOU-VODXQUuIR6_p", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "b4C_3cNl", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:27.116756+00:00", + "__module__": "datetime" + }, + "trace_id": "rOU-VODXQUuIR6_p", + "type": "metric", + "unit": "tokens", + "value": 48 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "b4C_3cNl", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:27.116762+00:00", + "__module__": "datetime" + }, + "trace_id": "rOU-VODXQUuIR6_p", + "type": "metric", + "unit": "tokens", + "value": 88 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "parameters\": {\"query\": \"Llama3-8B attention type", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "69cc8903-d256-40bb-aa1e-7f3935986e49", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "05SrG-G4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:24.286222+00:00", + "__module__": "datetime" + }, + "trace_id": "6eJM3WR0QsyIiMfg", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "05SrG-G4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:24.286242+00:00", + "__module__": "datetime" + }, + "trace_id": "6eJM3WR0QsyIiMfg", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "05SrG-G4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:24.286244+00:00", + "__module__": "datetime" + }, + "trace_id": "6eJM3WR0QsyIiMfg", + "type": "metric", + "unit": "tokens", + "value": 50 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief 
Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) - Investopedia\\\", \\\"url\\\": \\\"https://www.investopedia.com/terms/m/mark-zuckerberg.asp\\\", \\\"content\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg is a self-taught computer programmer and co-founder, chair, and chief executive officer of Meta (META), formerly known as Facebook. Mark Zuckerberg is a self-taught computer programmer and the co-founder, chair, and CEO of Meta (formerly Facebook). In April 2018, Zuckerberg testified on Capitol Hill about Facebook's use of users' information, including the sharing of 87 million users' information to Cambridge Analytica. 
Technically, Mark Zuckerberg makes a salary of $1 a year at Facebook. Booker Join With Facebook Founder and CEO Mark Zuckerberg to Advance a National Model for Improving Public Schools.\\\\\\\"\\\", \\\"score\\\": 0.74697095, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\\\"Challah Horse\\\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\\\", \\\"score\\\": 0.6410185, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + 
"metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "HyrnM7Qp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:30.044240+00:00", + "__module__": "datetime" + }, + "trace_id": "7cHuamFcQay638rC", + "type": "metric", + "unit": "tokens", + "value": 1203 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "HyrnM7Qp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:30.044278+00:00", + "__module__": "datetime" + }, + "trace_id": "7cHuamFcQay638rC", + "type": "metric", + "unit": "tokens", + "value": 19 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "HyrnM7Qp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:30.044287+00:00", + "__module__": "datetime" + }, + "trace_id": "7cHuamFcQay638rC", + "type": "metric", + "unit": "tokens", + "value": 1222 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for 
information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "a4d59df1-70b9-4f99-84ea-aa3a103b82ad", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "jOaA28AT", + "timestamp": { + "__class__": "datetime", + "__datetime__": 
"2025-03-06T04:40:21.259444+00:00", + "__module__": "datetime" + }, + "trace_id": "7cHuamFcQay638rC", + "type": "metric", + "unit": "tokens", + "value": 34 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "jOaA28AT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:21.259478+00:00", + "__module__": "datetime" + }, + "trace_id": "7cHuamFcQay638rC", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "jOaA28AT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:21.259485+00:00", + "__module__": "datetime" + }, + "trace_id": "7cHuamFcQay638rC", + "type": "metric", + "unit": "tokens", + "value": 44 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is a fictional liquid from the Harry Potter series", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". The function is only able to find the boiling point of real liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "hmXLMi0u", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:14.642967+00:00", + "__module__": "datetime" + }, + "trace_id": "-Go8XWSYSRG2j2Ea", + "type": "metric", + "unit": "tokens", + "value": 70 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "hmXLMi0u", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:14.642981+00:00", + "__module__": "datetime" + }, + "trace_id": "-Go8XWSYSRG2j2Ea", + "type": "metric", + "unit": "tokens", + "value": 56 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "hmXLMi0u", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:14.642984+00:00", + "__module__": 
"datetime" + }, + "trace_id": "-Go8XWSYSRG2j2Ea", + "type": "metric", + "unit": "tokens", + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + 
"type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not able to find the boiling point of polyjuice as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "ttsui3ip", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:53.513474+00:00", + "__module__": "datetime" + }, + "trace_id": "p1tRy8A3Q7KFFDLH", + "type": "metric", + "unit": "tokens", + "value": 70 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "ttsui3ip", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:53.513507+00:00", + "__module__": "datetime" + }, + "trace_id": "p1tRy8A3Q7KFFDLH", + "type": "metric", + "unit": "tokens", + "value": 38 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "ttsui3ip", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:53.513514+00:00", + "__module__": "datetime" + }, + "trace_id": "p1tRy8A3Q7KFFDLH", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "nUJGFTmQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:07.133674+00:00", + "__module__": "datetime" + }, + "trace_id": "Xtf06INCSmyxkwGf", + "type": "metric", + "unit": "tokens", + "value": 70 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "nUJGFTmQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:07.133708+00:00", + "__module__": "datetime" + }, + "trace_id": "Xtf06INCSmyxkwGf", + "type": "metric", + "unit": "tokens", + "value": 38 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "nUJGFTmQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:07.133715+00:00", + "__module__": "datetime" + }, + "trace_id": "Xtf06INCSmyxkwGf", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, 
\"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\", \"parameters\": {\"liquid_name\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "1e925ff5-d0b8-4b87-b3c3-a1a36f69626d", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "OG8Jlmhk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:10.868586+00:00", + "__module__": "datetime" + }, + "trace_id": "KgDQc2UfSrau2dZD", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "OG8Jlmhk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:10.868615+00:00", + "__module__": "datetime" + }, + "trace_id": "KgDQc2UfSrau2dZD", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "OG8Jlmhk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:10.868621+00:00", + "__module__": "datetime" + }, + "trace_id": "KgDQc2UfSrau2dZD", + "type": "metric", + "unit": "tokens", + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, 
\"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + 
"__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "5721b667-748d-4e14-953c-ec67ad2aa152", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "mmWnwqPx", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:51.740989+00:00", + "__module__": "datetime" + }, + "trace_id": "i8h2T9ZHRMiTL0YG", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "mmWnwqPx", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:51.741006+00:00", + "__module__": "datetime" + }, + "trace_id": "i8h2T9ZHRMiTL0YG", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "mmWnwqPx", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:51.741009+00:00", + "__module__": "datetime" + }, + "trace_id": "i8h2T9ZHRMiTL0YG", + "type": "metric", + "unit": "tokens", + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": 
[{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " couldn't find any information on the boiling point of Polyjuice. Polyju", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "ice is a magical potion in the Harry Potter series that allows the drinker", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to transform into someone else. It's not a physical substance", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " with a boiling point. 
If you have", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " any other questions, I'd be happy to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "_CvLa4Gk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:09.509742+00:00", + "__module__": "datetime" + }, + "trace_id": "GUkufTl4SZSHCyBF", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "_CvLa4Gk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:09.509773+00:00", + "__module__": "datetime" + }, + "trace_id": "GUkufTl4SZSHCyBF", + "type": "metric", + "unit": "tokens", + "value": 73 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "_CvLa4Gk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:09.509780+00:00", + "__module__": "datetime" + }, + "trace_id": "GUkufTl4SZSHCyBF", + "type": "metric", + "unit": "tokens", + "value": 103 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, 
\"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "7208784f-0e3f-4ae5-933b-7cc96b2d9375", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": 
"llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "MiP-_LQE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:04.875000+00:00", + "__module__": "datetime" + }, + "trace_id": "3_z5Yy0wStST3JAm", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "MiP-_LQE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:04.875027+00:00", + "__module__": "datetime" + }, + "trace_id": "3_z5Yy0wStST3JAm", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "MiP-_LQE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:41:04.875032+00:00", + "__module__": "datetime" + }, + "trace_id": "3_z5Yy0wStST3JAm", + "type": "metric", + "unit": "tokens", + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": 
\"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 100th prime number is 541.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "1eo6b4br", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:38.093912+00:00", + "__module__": "datetime" + }, + "trace_id": "PA3C-YQ-RtaWHr7k", + "type": "metric", + "unit": "tokens", + "value": 251 + }, + { + "attributes": { + "model_id": 
"meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "1eo6b4br", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:38.093946+00:00", + "__module__": "datetime" + }, + "trace_id": "PA3C-YQ-RtaWHr7k", + "type": "metric", + "unit": "tokens", + "value": 20 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "1eo6b4br", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:38.093956+00:00", + "__module__": "datetime" + }, + "trace_id": "PA3C-YQ-RtaWHr7k", + "type": "metric", + "unit": "tokens", + "value": 271 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + 
"data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def is_prime(n):\n if n <= 1:\n ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return False\n if n <= 3", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\n return True\n if n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " % 2 == 0 or n % 3 ==", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0:\n return False\n i = 5\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " while i * i <= n:\n ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + 
"data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n % i == 0 or n % (i", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " + 2) == 0:\n return False\n ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " i += 6\n return True\n\ndef get_nth_prime", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(n):\n count = 0\n num = 2\n while", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " True:\n if is_prime(num):\n count += 1\n if", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " count == n:\n return num\n num += 1\n\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(get_nth_prime(100))", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" + }, + "call_id": "6e8a3719-a151-4f66-bee2-416bb262b9ad", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "ONk3SjW9", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:37.386737+00:00", + "__module__": "datetime" + }, + "trace_id": "PA3C-YQ-RtaWHr7k", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "ONk3SjW9", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:37.386768+00:00", + "__module__": "datetime" + }, + "trace_id": "PA3C-YQ-RtaWHr7k", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "ONk3SjW9", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:40:37.386775+00:00", + "__module__": "datetime" + }, + "trace_id": "PA3C-YQ-RtaWHr7k", + "type": "metric", + "unit": "tokens", + "value": 50 + } + ] + } + } + ], + "type": "generator" + }, + 
"[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked 
as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Per", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "plexity the company was founded in 2022.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "vFe6LmM2", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:18.095687+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 105 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vFe6LmM2", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:18.095731+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 22 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vFe6LmM2", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:18.095738+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 127 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National 
Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + 
"__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Perplexity company founding date\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "d631bb54-a82b-43c2-a2ad-cfb6f137a30c", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "o0vtaC1m", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:17.530116+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 67 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "o0vtaC1m", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:17.530143+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "o0vtaC1m", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:17.530149+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 104 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", 
\"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"knowledge_search\", \"parameters\": {\"query\": \"Perplexity", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " company founding date\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "fdd3b71b-9608-4e31-b2dc-4019d5732c9c", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "pP3mZKZI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:16.766858+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 29 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "pP3mZKZI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:16.766887+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "pP3mZKZI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:16.766890+00:00", + "__module__": "datetime" + }, + "trace_id": "1TSzhwWfQVaTaa-W", + "type": "metric", + "unit": "tokens", + "value": 39 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", 
\"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", 
\"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " NBA was created on August 3, 1949, with", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the merger of the Basketball Association of America (BAA) and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the National Basketball League (NBL).", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "2IUoADvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:20.625791+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 103 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "2IUoADvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:20.625819+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 45 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "2IUoADvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:20.625827+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 148 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 
2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"when was the nba created\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "when was the nba created" + }, + "call_id": "0c671028-deee-4ee8-95bd-5aec474c1ac9", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + 
"__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "bY3DnNes", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:20.197499+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 65 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "bY3DnNes", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:20.197531+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "bY3DnNes", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:20.197538+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 102 + } + ] + } + } + ], + "type": "generator" + }, + "[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]]_{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"query\": \"when was the nba created\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "when was the nba created" + }, + "call_id": "92a4755c-66e1-43bb-ac4b-cb63109591e7", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": 
"llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "_lkO0yBc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:19.550197+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 27 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "_lkO0yBc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:19.550227+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "_lkO0yBc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:42:19.550235+00:00", + "__module__": "datetime" + }, + "trace_id": "_7bSgNpLRmSbHN6U", + "type": "metric", + "unit": "tokens", + "value": 37 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "ehKvLn9e", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:07.946658+00:00", + "__module__": "datetime" + }, + "trace_id": "gYfhKRXmT0qqnh4V", + "type": "metric", + "unit": "tokens", + "value": 139 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "ehKvLn9e", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:07.946690+00:00", + "__module__": "datetime" + }, + "trace_id": "gYfhKRXmT0qqnh4V", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "ehKvLn9e", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:07.946698+00:00", + "__module__": "datetime" + }, + "trace_id": "gYfhKRXmT0qqnh4V", + "type": "metric", + "unit": "tokens", + "value": 162 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"get_boiling_point\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "parameters\": {\"liquid_name\": \"polyju", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "ice\", \"celcius\": \"false\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "ccb7e766-3cbd-4cd1-ac24-7d59fdbd32dd", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "f8N9xscj", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:06.326554+00:00", + "__module__": "datetime" + }, + "trace_id": "pbTGwscoS2O-TOD7", + "type": "metric", + "unit": "tokens", + "value": 91 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "f8N9xscj", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:06.326581+00:00", + "__module__": "datetime" + }, + "trace_id": "pbTGwscoS2O-TOD7", + "type": "metric", + "unit": "tokens", + "value": 45 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "f8N9xscj", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:06.326587+00:00", + "__module__": "datetime" + }, + "trace_id": "pbTGwscoS2O-TOD7", + "type": "metric", + "unit": "tokens", + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + 
"value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"polyjuice\", \"celcius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "78adc0b9-cd6a-4052-b434-1db332fac11f", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "4ZGPgl-J", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:55.006558+00:00", + "__module__": "datetime" + }, + "trace_id": "0JdU31UqRW6uyUfy", + "type": "metric", + "unit": "tokens", + "value": 43 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "4ZGPgl-J", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:55.006570+00:00", + "__module__": "datetime" + }, + "trace_id": "0JdU31UqRW6uyUfy", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "4ZGPgl-J", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:55.006572+00:00", + "__module__": "datetime" + }, + "trace_id": "0JdU31UqRW6uyUfy", + "type": "metric", + "unit": "tokens", + "value": 53 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", 
+ "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "TRGdCKiq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:38.684993+00:00", + "__module__": "datetime" + }, + "trace_id": "yO1YOhixQ9mpO4rb", + "type": "metric", + "unit": "tokens", + "value": 85 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "TRGdCKiq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:38.685019+00:00", + "__module__": "datetime" + }, + "trace_id": "yO1YOhixQ9mpO4rb", + "type": "metric", + "unit": "tokens", + "value": 22 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "TRGdCKiq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:38.685025+00:00", + "__module__": "datetime" + }, + "trace_id": "yO1YOhixQ9mpO4rb", + "type": "metric", + "unit": "tokens", + "value": 107 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, 
\"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "lHrhiQgT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:39.714686+00:00", + "__module__": "datetime" + }, + "trace_id": "0jyTQ_JVTyO8Fz_O", + "type": "metric", + "unit": "tokens", + "value": 87 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "lHrhiQgT", + "timestamp": { + "__class__": "datetime", + 
"__datetime__": "2025-03-06T04:49:39.714720+00:00", + "__module__": "datetime" + }, + "trace_id": "0jyTQ_JVTyO8Fz_O", + "type": "metric", + "unit": "tokens", + "value": 22 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "lHrhiQgT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:39.714727+00:00", + "__module__": "datetime" + }, + "trace_id": "0jyTQ_JVTyO8Fz_O", + "type": "metric", + "unit": "tokens", + "value": 109 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"cel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "cius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "ec5e1671-d607-46ae-804b-4f15e42e51b2", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "GbmO2wcg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:38.172673+00:00", + "__module__": "datetime" + }, + "trace_id": "Fquzg9P5RfSrqSeH", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "GbmO2wcg", + "timestamp": { + 
"__class__": "datetime", + "__datetime__": "2025-03-06T04:49:38.172704+00:00", + "__module__": "datetime" + }, + "trace_id": "Fquzg9P5RfSrqSeH", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "GbmO2wcg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:38.172712+00:00", + "__module__": "datetime" + }, + "trace_id": "Fquzg9P5RfSrqSeH", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": 
{ + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": \"get_boiling_point_with_metadata\", \"parameters\": {\"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "liquid_name\": \"polyjuice\", \"celcius\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "1f6ad98b-871e-43fd-a866-53f54acb9466", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": 
"llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "gn-gDCYG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:39.300170+00:00", + "__module__": "datetime" + }, + "trace_id": "U3gRmVfKQK6UkwCL", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "gn-gDCYG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:39.300210+00:00", + "__module__": "datetime" + }, + "trace_id": "U3gRmVfKQK6UkwCL", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "gn-gDCYG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:39.300222+00:00", + "__module__": "datetime" + }, + "trace_id": "U3gRmVfKQK6UkwCL", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " customer smiled and said \"hello\" to the 
friendly store clerk.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "V_N39zVn", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:05.597771+00:00", + "__module__": "datetime" + }, + "trace_id": "S-YEXTxAQyqX6Sbg", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "V_N39zVn", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:05.597811+00:00", + "__module__": "datetime" + }, + "trace_id": "S-YEXTxAQyqX6Sbg", + "type": "metric", + "unit": "tokens", + "value": 24 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "V_N39zVn", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:05.597818+00:00", + "__module__": "datetime" + }, + "trace_id": "S-YEXTxAQyqX6Sbg", + "type": "metric", + "unit": "tokens", + "value": 54 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": 
\"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m unable to run the code as I'm missing the `b", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "wrap.core` module. 
However, I can provide a general solution", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for you.\n\nTo describe a CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file, you can use the `pandas` library in Python.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Here's a general solution:\n\n1.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Import the `pandas` library.\n2. Load the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file using `pd.read_csv()`.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "3. Print the first few rows of the dataframe using `df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".head()`.\n4. 
Print the data types of each", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " column using `df.dtypes`.\n5. Print the summary", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " statistics of the dataframe using `df.describe()`.\n\nThis will give", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you a general idea of what the CSV file contains. If you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " need more specific information, please let me know and I'll be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " happy to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "uKno8S5o", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:19.978994+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 355 + }, + { + "attributes": { + "model_id": 
"meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "uKno8S5o", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:19.979047+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 166 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "uKno8S5o", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:19.979054+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 521 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": 
\"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/c", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "z/vyh7y1d11xg881lsxssh", 
+ "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "nc5c0000gn/T/tmplr_wf0lb", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/Pl4Pewubinflation.csv\")\n\n# Print the first few", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " rows of the dataframe\nprint(df.head())\n\n# Print the data types of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " each column\nprint(df.dtypes)\n\n# Print the summary statistics of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + 
"__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmplr_wf0lb/Pl4Pewubinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "40ed30d4-05c7-4a7f-93b0-e1e6e43e48de", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "sz886Glf", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:18.831808+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 196 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "sz886Glf", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:18.831870+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "sz886Glf", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:18.831879+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 206 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", 
\"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": 
"llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/cz", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/vyh7y1d11xg881", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "lsxsshnc5c0000gn/T/tmplr", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_wf0lb/Pl4Pewubinflation.csv\")\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print the first few rows of the dataframe\nprint(df.head", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "())\n\n# Print the data types of each column\nprint(df.d", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + 
"delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "types)\n\n# Print the summary statistics of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmplr_wf0lb/Pl4Pewubinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "0a037488-ab9e-46e9-bdc4-7ee6f9ef0e1e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "NoDjls_F", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:17.910457+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "NoDjls_F", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:17.910513+00:00", + "__module__": "datetime" + }, + "trace_id": 
"qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "NoDjls_F", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:17.910522+00:00", + "__module__": "datetime" + }, + "trace_id": "qchwuhR3TlCRLUu5", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. 
\\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " average yearly inflation over time. The x", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-axis represents the year and the y-axis represents the average inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
The plot will also include a title, labels for the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " x and y axes, and a grid to make it easier", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to read.\n\nPlease replace \"inflation.csv\" with your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " actual csv file name. \n\nAlso, make sure that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is in the correct format and that the pandas library can read it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correctly. \n\nIf your csv file has a different column name for", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the date, you will need to replace", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'date' with the actual column name. 
\n\nIf your csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file has a different column name for the inflation, you will need", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to replace 'inflation' with the actual column name. \n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "If you want to save the plot to a file instead of displaying", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it, you can use the `savefig` method. 
For", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " example:\n\n```\nplt.savefig('average_inflation.png')\n```", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "2Yx8i0id", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:51.132007+00:00", + "__module__": "datetime" + }, + "trace_id": "N2BeNv66RcO7NRuE", + "type": "metric", + "unit": "tokens", + "value": 666 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "2Yx8i0id", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:51.132048+00:00", + "__module__": "datetime" + }, + "trace_id": "N2BeNv66RcO7NRuE", + "type": "metric", + "unit": "tokens", + "value": 200 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "2Yx8i0id", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:51.132054+00:00", + "__module__": "datetime" + }, + "trace_id": "N2BeNv66RcO7NRuE", + "type": "metric", + "unit": "tokens", + "value": 866 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, 
\"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data\ndf = pd.read_csv(\"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "inflation.csv\")\n\n# Convert date column to datetime\ndf", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "['date'] = pd.to_datetime(df['date'])\n\n# Group", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " by year and calculate 
average inflation\naverage_inflation = df.groupby", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df['date'].dt.year)['inflation'].mean()\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Plot average yearly inflation as a time series\nplt.figure(figsize=(", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "10,6))\nplt.plot(average_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.index, average_inflation.values, marker='o')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + 
"parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"inflation.csv\")\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "cfae3ff5-49f8-439d-b740-603bc93fb5a3", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "JNrmlTTc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:39.920493+00:00", + "__module__": "datetime" + }, + "trace_id": "N2BeNv66RcO7NRuE", + "type": "metric", + "unit": "tokens", + "value": 476 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + 
"span_id": "JNrmlTTc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:39.920519+00:00", + "__module__": "datetime" + }, + "trace_id": "N2BeNv66RcO7NRuE", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "JNrmlTTc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:39.920522+00:00", + "__module__": "datetime" + }, + "trace_id": "N2BeNv66RcO7NRuE", + "type": "metric", + "unit": "tokens", + "value": 486 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", 
\"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "It", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " seems that the file \"/var/folders", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/cz/vyh7y1d11xg881lsx", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "sshnc5c0000gn/T/t", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "mplr_wf0lb/p99E", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "7wY2inflation.csv\" does not exist. 
\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "To describe the csv file, you need to provide the actual file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path or the file itself. If you are using a local file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you can use the `load_data` function from the `", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "code_interpreter` library to load the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file. 
\n\nHere is an example of how you can describe", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the csv file:\n\n```\nimport pandas as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " pd\nfrom code_interpreter import load_data\n\n# Load data", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\ndf = load_data('inflation.csv')\n\n# Print summary of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the data\nprint(df.head()) #", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Print the first few rows of the data\nprint(df.info())", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " # Print information about the data\nprint(df.describe()) ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " # Print summary statistics about the data\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "```\n\nPlease replace 'inflation.csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "' with your actual csv file name.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \n\nIf you are using a remote file, you need to provide", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the actual file path or the file itself.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \n\nAlso, make sure that the file is in the correct format", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and that the pandas library can read it correctly.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "rE7rhw1s", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:30.946947+00:00", + "__module__": "datetime" + }, + "trace_id": 
"RPZJ19J7SzaX6t6h", + "type": "metric", + "unit": "tokens", + "value": 213 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "rE7rhw1s", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:30.946979+00:00", + "__module__": "datetime" + }, + "trace_id": "RPZJ19J7SzaX6t6h", + "type": "metric", + "unit": "tokens", + "value": 261 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "rE7rhw1s", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:30.946982+00:00", + "__module__": "datetime" + }, + "trace_id": "RPZJ19J7SzaX6t6h", + "type": "metric", + "unit": "tokens", + "value": 474 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + 
"parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh7y1d", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "11xg881lsxsshnc5c0000gn/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmplr_wf0lb/p99E7wY2", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "inflation.csv\")\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": 
{ + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Rows\nprint(\"Number of rows and columns in the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the data are:\", len(df.columns))\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Column names\nprint(\"Columns of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the data are:\", df.columns)\n# Column dtypes\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(\"Datatype of the columns are:\", df.dtypes)", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmplr_wf0lb/p99E7wY2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + }, + "call_id": "1db58db0-92c5-4e65-8e83-631bef020ef4", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "W_qnYIUI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:29.106322+00:00", + "__module__": "datetime" + }, + "trace_id": "RPZJ19J7SzaX6t6h", + "type": "metric", + "unit": "tokens", + "value": 36 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "W_qnYIUI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:29.106333+00:00", + "__module__": "datetime" + }, + "trace_id": "RPZJ19J7SzaX6t6h", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "W_qnYIUI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:29.106336+00:00", + "__module__": "datetime" + }, + "trace_id": "RPZJ19J7SzaX6t6h", + "type": "metric", + "unit": "tokens", + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:13786\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "name\": \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"How to use LoRA in Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "7815c1ab-fbdf-42e8-84a7-b1f74f67d863", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, 
+ "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "KM-vILDG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:01.270069+00:00", + "__module__": "datetime" + }, + "trace_id": "NIVx0ka-TmKDiZaU", + "type": "metric", + "unit": "tokens", + "value": 117 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "KM-vILDG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:01.270143+00:00", + "__module__": "datetime" + }, + "trace_id": "NIVx0ka-TmKDiZaU", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KM-vILDG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:01.270151+00:00", + "__module__": "datetime" + }, + "trace_id": "NIVx0ka-TmKDiZaU", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une based on the documentation you provided. 
What's your first", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "5yc3Hts6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:59.857021+00:00", + "__module__": "datetime" + }, + "trace_id": "6KRztpbwTwquLEUn", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "5yc3Hts6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:59.857048+00:00", + "__module__": "datetime" + }, + "trace_id": "6KRztpbwTwquLEUn", + "type": "metric", + "unit": "tokens", + "value": 35 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "5yc3Hts6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:59.857055+00:00", + "__module__": "datetime" + }, + "trace_id": "6KRztpbwTwquLEUn", + "type": "metric", + "unit": "tokens", + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:1b69d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "parameters\": {\"query\": \"How to use LoRA in Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "c92271a7-37e2-4396-aa7f-5805b9273a71", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + 
"__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Z6HS-lIg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:08.648346+00:00", + "__module__": "datetime" + }, + "trace_id": "1NwedpozRqOVQXRs", + "type": "metric", + "unit": "tokens", + "value": 117 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Z6HS-lIg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:08.648375+00:00", + "__module__": "datetime" + }, + "trace_id": "1NwedpozRqOVQXRs", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Z6HS-lIg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:08.648382+00:00", + "__module__": "datetime" + }, + "trace_id": "1NwedpozRqOVQXRs", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided. 
What's your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "o33PSCts", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:07.268876+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "o33PSCts", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:07.268906+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 35 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "o33PSCts", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:07.268914+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": {\"query\": \"Torchtune documentation\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "26bf5efc-c1da-4229-86d9-853f45d3a0f6", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "UUPCfOjW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:06.661392+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 39 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "UUPCfOjW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:06.661422+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "UUPCfOjW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:06.663497+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 49 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": 
[{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. 
_prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " by Llama3-8B is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qzbGsIc-", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:56.822860+00:00", + "__module__": "datetime" + }, + "trace_id": "5LMJTs_wRBiwAPaF", + "type": "metric", + "unit": "tokens", + "value": 80 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qzbGsIc-", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:56.822890+00:00", + "__module__": "datetime" + }, + "trace_id": "5LMJTs_wRBiwAPaF", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qzbGsIc-", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:56.822897+00:00", + "__module__": "datetime" + }, + "trace_id": "5LMJTs_wRBiwAPaF", + "type": "metric", + "unit": "tokens", + "value": 106 + } + ] + } + } + ], + 
"type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. 
code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. 
note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "B is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": 
"llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "WbLMJeWt", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:43.468600+00:00", + "__module__": "datetime" + }, + "trace_id": "ISGpsBHRTjG_DfWw", + "type": "metric", + "unit": "tokens", + "value": 80 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "WbLMJeWt", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:43.468641+00:00", + "__module__": "datetime" + }, + "trace_id": "ISGpsBHRTjG_DfWw", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "WbLMJeWt", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:43.468649+00:00", + "__module__": "datetime" + }, + "trace_id": "ISGpsBHRTjG_DfWw", + "type": "metric", + "unit": "tokens", + "value": 106 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"type\": \"function\",\n \"name\": \"knowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\",\n \"parameters\": {\n \"query\": \"Llama3-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8B attention type\"\n }\n}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "50f2c13d-14c1-417e-bc85-89e23afab120", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + 
"__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "5I5ujhpm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:45.629100+00:00", + "__module__": "datetime" + }, + "trace_id": "5LMJTs_wRBiwAPaF", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "5I5ujhpm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:45.629127+00:00", + "__module__": "datetime" + }, + "trace_id": "5LMJTs_wRBiwAPaF", + "type": "metric", + "unit": "tokens", + "value": 48 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "5I5ujhpm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:45.629133+00:00", + "__module__": "datetime" + }, + "trace_id": "5LMJTs_wRBiwAPaF", + "type": "metric", + "unit": "tokens", + "value": 88 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"knowledge_search\", \"parameters\": {\"query\": \"Llama", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3-8B attention type\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "70b24279-f0ed-49cc-ab4f-9bd3d7af9554", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "9GrKkBwq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:39.870328+00:00", + "__module__": "datetime" + }, + "trace_id": "ISGpsBHRTjG_DfWw", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "9GrKkBwq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:39.870341+00:00", + "__module__": "datetime" + }, + "trace_id": "ISGpsBHRTjG_DfWw", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "9GrKkBwq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:39.870347+00:00", + "__module__": "datetime" + }, + "trace_id": "ISGpsBHRTjG_DfWw", + "type": "metric", + "unit": "tokens", + "value": 50 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief 
Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) - Investopedia\\\", \\\"url\\\": \\\"https://www.investopedia.com/terms/m/mark-zuckerberg.asp\\\", \\\"content\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg is a self-taught computer programmer and co-founder, chair, and chief executive officer of Meta (META), formerly known as Facebook. Mark Zuckerberg is a self-taught computer programmer and the co-founder, chair, and CEO of Meta (formerly Facebook). In April 2018, Zuckerberg testified on Capitol Hill about Facebook's use of users' information, including the sharing of 87 million users' information to Cambridge Analytica. 
Technically, Mark Zuckerberg makes a salary of $1 a year at Facebook. Booker Join With Facebook Founder and CEO Mark Zuckerberg to Advance a National Model for Improving Public Schools.\\\\\\\"\\\", \\\"score\\\": 0.74697095, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\\\"Challah Horse\\\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\\\", \\\"score\\\": 0.6410185, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + 
"metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "LWwngTMJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:24.889991+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 1203 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "LWwngTMJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:24.890015+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 19 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "LWwngTMJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:24.890017+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 1222 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for 
information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "f84788f5-ef46-4e13-aa57-3ea4ecb223c1", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "tWTHAFOr", + "timestamp": { + "__class__": "datetime", + "__datetime__": 
"2025-03-06T04:47:17.453332+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 34 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "tWTHAFOr", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:17.453359+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "tWTHAFOr", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:17.453365+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 44 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is a fictional liquid from the Harry Potter series", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". The function is only able", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to find the boiling point of real liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "ZFinp6U7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:30.079245+00:00", + "__module__": "datetime" + }, + "trace_id": "mUx8OGhtSEW1DSOB", + "type": "metric", + "unit": "tokens", + "value": 70 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "ZFinp6U7", + "timestamp": { + "__class__": "datetime", + "__datetime__": 
"2025-03-06T04:48:30.079279+00:00", + "__module__": "datetime" + }, + "trace_id": "mUx8OGhtSEW1DSOB", + "type": "metric", + "unit": "tokens", + "value": 56 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "ZFinp6U7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:30.079284+00:00", + "__module__": "datetime" + }, + "trace_id": "mUx8OGhtSEW1DSOB", + "type": "metric", + "unit": "tokens", + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + 
"delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " able to find the boiling point of poly", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "juice as it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "JtmG7Qaq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:53.738043+00:00", + "__module__": "datetime" + }, + "trace_id": "g2nkdPGEQ_KS9-qQ", + "type": "metric", + "unit": "tokens", + "value": 70 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "JtmG7Qaq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:53.738072+00:00", + "__module__": "datetime" + }, + "trace_id": "g2nkdPGEQ_KS9-qQ", + "type": "metric", + "unit": "tokens", + "value": 38 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "JtmG7Qaq", + "timestamp": { + "__class__": "datetime", + "__datetime__": 
"2025-03-06T04:47:53.738079+00:00", + "__module__": "datetime" + }, + "trace_id": "g2nkdPGEQ_KS9-qQ", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + 
"event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice as it is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "hyoRl-YH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:15.559044+00:00", + "__module__": "datetime" + }, + "trace_id": "pHT6bhi3THO6qYi9", + "type": "metric", + "unit": "tokens", + "value": 70 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "hyoRl-YH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:15.559075+00:00", + "__module__": "datetime" + }, + "trace_id": "pHT6bhi3THO6qYi9", + "type": "metric", + "unit": "tokens", + "value": 38 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "hyoRl-YH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:15.559082+00:00", + "__module__": "datetime" + }, + "trace_id": "pHT6bhi3THO6qYi9", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, 
\"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + 
"tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "ae161bf4-6f03-4830-8f08-3999d20c066a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "HLJCauvN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:28.686660+00:00", + "__module__": "datetime" + }, + "trace_id": "3uSIGGP2TcatIhQ7", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "HLJCauvN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:28.686691+00:00", + "__module__": "datetime" + }, + "trace_id": "3uSIGGP2TcatIhQ7", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "HLJCauvN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:28.686695+00:00", + "__module__": "datetime" + }, + "trace_id": "3uSIGGP2TcatIhQ7", + "type": "metric", + "unit": "tokens", + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": 
\"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"poly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "juice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + 
"value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "c8369271-9c41-4787-b5a7-0280822f3732", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Ta9THPS8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:52.569263+00:00", + "__module__": "datetime" + }, + "trace_id": "W6rZ8mwBRRu661Ox", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Ta9THPS8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:52.569291+00:00", + "__module__": "datetime" + }, + "trace_id": "W6rZ8mwBRRu661Ox", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Ta9THPS8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:52.569297+00:00", + "__module__": "datetime" + }, + "trace_id": "W6rZ8mwBRRu661Ox", + "type": "metric", + "unit": "tokens", + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " couldn't find any information on the boiling point of Poly", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "juice. Polyjuice is a magical potion in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Harry Potter series that allows the drinker", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to transform into someone else. 
It", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'s not a physical substance with a boiling point.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " If you have any other questions, I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'d be happy to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "FRDVTn1V", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:17.228586+00:00", + "__module__": "datetime" + }, + "trace_id": "3GXhBV5vSn2cf6Pi", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "FRDVTn1V", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:17.228639+00:00", + "__module__": "datetime" + }, + "trace_id": "3GXhBV5vSn2cf6Pi", + "type": "metric", + "unit": "tokens", + "value": 73 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "FRDVTn1V", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:17.228647+00:00", + "__module__": "datetime" + }, + "trace_id": "3GXhBV5vSn2cf6Pi", + "type": "metric", + "unit": "tokens", + "value": 103 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "63bb757c-e433-4e14-b527-6989b7ae6582", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "j1OaNojM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:09.337637+00:00", + "__module__": "datetime" + }, + "trace_id": "ZAeUlaWpRVSas5hb", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "j1OaNojM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:09.337664+00:00", + "__module__": "datetime" + }, + "trace_id": "ZAeUlaWpRVSas5hb", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "j1OaNojM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:09.337668+00:00", + "__module__": "datetime" + }, + "trace_id": "ZAeUlaWpRVSas5hb", + "type": "metric", + "unit": "tokens", + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return 
False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 100th prime number is 541.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + 
"__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "uwED-DA9", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:27.524949+00:00", + "__module__": "datetime" + }, + "trace_id": "04_0VtRzTY-hrOyG", + "type": "metric", + "unit": "tokens", + "value": 251 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "uwED-DA9", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:27.524984+00:00", + "__module__": "datetime" + }, + "trace_id": "04_0VtRzTY-hrOyG", + "type": "metric", + "unit": "tokens", + "value": 20 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "uwED-DA9", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:27.524991+00:00", + "__module__": "datetime" + }, + "trace_id": "04_0VtRzTY-hrOyG", + "type": "metric", + "unit": "tokens", + "value": 271 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def is_prime(n):\n if n <= 1:\n return False\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n <= 3:\n return True\n if n % ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "2 == 0 or n % 3 == 0:\n return False", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n i = 5\n while i * i <= n:\n ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n % i == 0 or n % (i + 2)", + "type": "tool_call" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " == 0:\n return False", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " == n:\n return num\n num", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " += 1\n\nprint(get_nth_prime(100))", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" 
+ }, + "tool_call": { + "arguments": { + "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" + }, + "call_id": "297a9d9d-daaf-4d90-9496-2648a659aa27", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "LfE6srhj", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:26.949350+00:00", + "__module__": "datetime" + }, + "trace_id": "04_0VtRzTY-hrOyG", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "LfE6srhj", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:26.949380+00:00", + "__module__": "datetime" + }, + "trace_id": "04_0VtRzTY-hrOyG", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "LfE6srhj", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:26.949386+00:00", + "__module__": "datetime" + }, + "trace_id": "04_0VtRzTY-hrOyG", + "type": "metric", + "unit": "tokens", + "value": 50 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, 
{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", 
\"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Per", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "plexity the company was founded in 2022.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "25plHusk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:33.915838+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 105 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + 
"span_id": "25plHusk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:33.915878+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 22 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "25plHusk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:33.915886+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 127 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " {\"query\": \"Perplexity", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " company founding date\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + 
"metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "4521686e-4866-48a0-b676-30333fee6f3e", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "8BkjXIt4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:33.355430+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 67 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "8BkjXIt4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:33.355462+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "8BkjXIt4", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:33.355469+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 104 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": 
\"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"knowledge_search\", \"parameters\": {\"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": 
"query\": \"Perplexity company founding date\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "56701398-4b26-4359-aef2-438255259953", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "QTbOWgfM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:26.519884+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 29 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "QTbOWgfM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:26.519949+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "QTbOWgfM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:26.519955+00:00", + "__module__": "datetime" + }, + "trace_id": "CuKMEU31Q26a42-5", + "type": "metric", + "unit": "tokens", + "value": 39 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": 
[{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": 
\"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " NBA was created on August 3, 1949, with", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the merger of the Basketball Association of America (BAA) and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the National Basketball League (NBL).", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { 
+ "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "W6iEU_Dm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:37.336705+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 103 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "W6iEU_Dm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:37.336742+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 45 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "W6iEU_Dm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:37.336750+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 148 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall 
Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " {\"query\": \"when was the nba created\"}}", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "when was the nba created" + }, + "call_id": "82c81003-40bb-4e28-bfb0-9bae122da716", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "WX35-rLp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:36.663989+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 65 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "WX35-rLp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:36.664032+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "WX35-rLp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:36.664039+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 102 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " {\"query\": \"when was the nba created\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { 
+ "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "when was the nba created" + }, + "call_id": "8fcbc41f-3723-46dd-aee4-948caaa2b458", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "vNEXImhz", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:35.213589+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 27 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vNEXImhz", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:35.213622+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vNEXImhz", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:35.213629+00:00", + "__module__": "datetime" + }, + "trace_id": "4Y9e6Ll1RgS_fFdF", + "type": "metric", + "unit": "tokens", + "value": 37 + } + ] + } } ], "type": "generator" diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.pickle b/tests/integration/fixtures/recorded_responses/chat_completion.pickle deleted file mode 100644 index eb7534e6a7222e3faf9edc4a07629989e0466697..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 888589 zcmeFa>u)64btl+TcUx+zt%qev?rvEQvAUf}iJ8pgn{=@>tXDVXDt4)gt&&_UB_ktl zW<)VEB0D0IEOxsWV=QDWU;;K^`jhv=0{ai_0=omO;n@%R=D$HRn4QsLTEJ>7WB1!) 
z{cwiD9RdlfuT)o7R>0UpR<;qoZ6gR|Xn)d({38J6*H?(gJc5gyfCLR-=Yxghw7yye z@}*QOLXyZ3MG~P?uE^E~!6J}cUgjQ1zLeDueILX=^yoJ`%)bo)nae1vIs!OnzU!MYT(GV#Yc$_taO72Ir#FL!qBlh&p zT=u=WZ^-Y*JM(1UoBK-t@oWHC1yGXv+W$@#057yhk7 z^weygJPyiz)9O$`2utlpL%rnMI(w1Fm%S14{o;f<8JsW;%e&b1K?nmmqvYh3` zGf5rSz5;+$?pykf;3JJ>U=lG z)|CxElD55$9dxYW7_q4pqSb*JiUd{x#OBA}^{?{;#x9-3m$4fJx^$ zor-D*T8XZ`ISxoGfEL16AHYY7|3xQ$>fH>+3RxX#tQ2dl?^u{u1na%Fa-)ifDLf6Q zPK3CYz_ka~iDL=yOBcg{XH{?D36`b$5wG5lUkC;`^z;z5gK)6*Ar7UY6UXr^)nAos z>TU8(=Co=9gy_JlsGH1G53Qzb0?<%Jww}z7c-ZaW6Zg342fZ4Av1@DVD_5(l>uVcV zW#nH%{aD?=A0~Co)}=4NFayk9kkcLeZM&t~UtV3?T3)-lY48l$x9e-mIJiz1r^O8c zWmD}-(?Lkg5_%Q%cj!4?6(g$UX5=2p9f{cV+^9cpiM0ESZE|boo{7dcL&yc>&KbEt z)}C2F&-Z%+r3bji)Yj|5TO z4<#B7Q`;G~oDMO$Q>tiUI<2m3R9CjQHa8#(oS=WkKkqsbGE0Dm*=EE%-x*Kl3RnDK z+>TJ+IsRj5cYqb1Q8_RKw9aWcSn&BK9zIf9STcI65{;q!EW~m3KY92InP0{Xb)nLz+y7_+7L=!>A$Am?YeEM zN71Asf=R_1DK#xqntUKOOby`gts2#1?a}SMTla5QhaFBFkt0u?wPk6lSRCx#y zPnkY40i=dNp0KOd`*<=qaYF|YxF|zSFquh*V^EFiCxPvz?S69kLAnU|Z%lr-4e1)% zFv|i+_^E7a?Sb3#RC`6jsWSo5(2=9xHnd%pqj84|HeZ{}Mm;#He-7}JmVn+E^t8zg z6=_LK#g5+&Yf|@08@rRRg4)8!;+R!bEHi$NZL-Rm=tA}oUp^%dqP~bC89MA)h>1uA zIx@1MX=#<&9W+J^p2^9^4r1Fl*>=P0Pwzf>y#LJF-F^Jg}8Rs01C&&2*LD(1s%WE+O37C^noXRw4O}cCT#1V zZ;!D#`n(1Omj1kYr=;zZjSKY7!F;XU{WHhs=H$<$srwL#c${H%K{23LV)LI2E&`MV zx*rNcI~v;q+<!h_uu!l)6gi<}r@Mz^;g5lQjtv_P7d{39Cvcz#Vf`uC zG@5X#^ms(hP{CE}UB?4_C8S1@hLTos;9}h@S}1vkm{IBllaoqzb*#;z**d}7FFSk%M(A(y9YBm?&=I3x1O@C0NnRz zkIG%7Vol9x*tzH1?rhK#u||BjwP+N^iyr(b^s(5Z5&ljTuKNu^{;m^=A0|;5@JUJ1 zm{5n(QKXe!M57m*C*MCJi988-P+Q!sEx=5@F&FTwu96-mObWi2GmzCW@q)6)?7I2u6`6q@r;C*mR)jX#u{jAQUW*pOqC>KykTLmCe{zr18MRbpJ#j2C z%8cQMa-osnbcF?G_MF^!(T0?P;}}xhBOjf{+HlCx!;XyyLTAve#O|1Q_}hJ7UQg5P z>Eg$n^@oeg*5Xn8>DYXtx?04rjK-0=^vt{lMle=Bdl33;w34NUbTlB(hJ#U^M?AC_ zMatq|R9Wgr$VgTMeHt?sa?s$3Rjtw6Y&;dxI20X6+~0J&q(<`Qc#tyVU6$X_4IW1k z_Don$w_3OLnT#h)J+qdw0+vN|4K4;d-Lco+X(p1?SetvA-+pk5HNV*F`m9LOyDUk1 zvt0HWa`?w`njmu=_JVBEU6iEDTb%#m3M$Ku!o^-M zL9aq6@pR^uwEUC{7BfqoX7K&s9>h$Ao5r?0;PJvLfhO^KWBNmkE0GiR$T>X${aeUd z>FshyCMsVe>P_Kme+DUi_Tt}u{cAws!hfG3rS7=}NTa{xpTEMNDa_-4gNqLiU*K<+ zr~TjY%3Ftj!+*Y;LI+dj{d-(K+c6BX_ zh;(k!YhTCof++vx)50;8E za0ay%XW%+;s5k>x0Yr$HD9*sMMtpGw&J*$C47@l4FV4UpBTkIywZ_mYfNBLUSBHG5 z6KYtVF&r>|B?$0So*|^pq2(zLfyng|Ja=Fl1Tz~$KAfQ+oZ|k>k@Q`;1JURzJuHN? 
z4+dO(Z~E~2GdkSQeAQ&|#N>;#lVko%YMAUp1aYzGy&+>__n>(08JPV=H1Fr!c8 z2~G$`vMrn7$Ujdaa(NwLp<$J=?u0<%w1bqC5gqDzVQb|02U$WoQxl=o6V*HLn$2{7HQN4QJj6v^ zx6E&0bUHvgkgkPnLDmvgW_}}cOp!g+TnzgN;_r>3Mb#R_6CysuKk(2yZ6-2}oVMF# zS~(-IkrPz4_6CvPbMSbuY%vFM-i9$dbzlLMWjyE{ATtwD-2<0tjs6_*^An-jy1!xg zsi^FjVv_qB%1 zdF1ok1C?9JQdJx|+36Vh+X+!x+kgwN57oB=T8b`(3Cz;x~vw*fW2T@kZ266hZL&D4m!oGESWJ>Vfm?nF%J`?c*lCRK{9pJ;&oAlj1*?n zk*S9ot{sb`V6G)YwIh=;GgRg#878!wFc0S1j3Gh^dWdG;B*U|V>87SPTb7DKXD>6E zkW+kwFfr;Y>MKX`IYK~VM+3h0) z1vdgm=S$UC9&8D0?vR|(39p!^s_7T07!C&WLu)|Ks8-%mI^Sv!P;2I%K^;RCohs7; z$&YfH?hNU@#hE((9~nJJj+BNl>k0|6$@Wl{X%OPIuA`Mh<;Yq=|3T497Kyb(k>1g& zpOMW4_vK$)>bUs2!1vgyhvzJu-kK z;qI1Iil+(=#Z}tyiE{w*olJt6IwZznPx=AfKjrNzapk zO#vj#o0?&=+b2;lu668{b}9o46H;ZcFdK)dR4R#Gmc#lv{~>6E35EMAp->dGc?AvU)NvjTf4nNAf) z&Y_{UMILp&W8}2^o{L=oMwb#sR5p2+`IkC~!yYSo87$o+C||cvWkV^_jN~({k&)1o}cvvimdxcH~XaHZJvp5etG#mNuCjebxI-D7ai~*;nChK`P z9*+3*4wTuB>L&dJMF@i90=Ft{`|XN{RQ+He3NU#^852}*XU9UukJ1|u2VSbUkbKET z$B#QcP%ofpcAD^m0mP%y@{y+gqg7}4qo(zQMj3}qMiF>M0ZLM3S=JBL&*{Ekk3*Y< zpmGbfx6JB{ne$=VjgQImd!EH)?PC|HKE&Bz|HLb^EE(~TB@ex)vh>is9J)-GYCI3K zo=-jFd;-t_qM~Spn`8*;SEthiO+(M`v;nkEWsF4uHo0TE?O}i~z-<5+ii3Q|q#ie* zNQo2W+JRR2GR$!+cTIPgkxU-F9|Ls6(m{U*bxrcPzH;8v9L+w)2Rld=^TQPbid+kv zr1cR-NzVp_fw60DR@sP2`AF5Ih+rzP2j7{OFB1jssv}~0oDX7A9fzJZ0r^mB5Xt|z{xs-eiy+~@HEV33;;YJTe?IDOwwK# zg5^1qa0IQ1+@d3B7W5IJPaKCRSL`tT6V{}_A?6B&Jg#AggnEi5I%eo2$ypf&I(Jp& zJ)%g|EaWnYeSliT5NHHw3Q3wI5(VcHO3+pX#+~ar-yJv0Gz5hqb&8%7ooc9(5xLqP zIs$7W(@E5D`nA$zG+`<;F=NarDOB2!#=zJ!QVa)6-)SGgd=d^vR5>=nLa~Tg+a?xQ zoejq7$#}#9;ds>(_Xt`I#-<=`@muW?#;j3RjcC||p3@5KC4t7EVM(>yaYJmn4MYzt zNIlTV%nbZbfGwh2LKnQ}o_?&6c?>VAQg&~grqoulGFW9K|NEm6l2Mt!n!ZPte&WBS zL}3Zi!&yi5P7TzS<$(Ym>O6t!62b#KA%(F7n{eC`Xbpx-K&vQRBGLMNB=Q5Nt_6uf zPbT7~DhJq}4xlpNzJO|mxGH^!*9zK-pnu6CK7^on(Rhf8$p*Q(*Rh9B62p50kH&m3 zJ0{d;5@x5{S?O-343Q=U94V~uJ&1H;^BXMh zt)Fwz0jjy4r1Zu69?t-PMhim9=$y?P^UG9b(T8ume-0>gZG{ z;gs29Y*DdZx&&gIv|-FfMFV$nnTR*E+~g_GBo}IB?O=Gg$ZEI;9wD4z=wS3d(i$Ak zc(mVFGBr`|yL%D$Bx*hOurRKH9r4a|u`wK)i^o z)y;QrEE(04aeKi=%EG6yVwH;LVF?PYRer>3*akUm(BL&|4W>h?o736`H!FXG#)_z2 zyCI^gRaAQBrUu2g)Fi?(NMWY~T$~kXEaqXLTO7xcXNE+(P`EB)26f<%Q*+0`MiUDh z28>>6j+SzaBy$!%&*Y18o!6hjD=AXkNd3gQ{VWlJ)K7KX9t%h6I#`6GKD;W_P*{}c z8ZgQUrNdLr3bCoWQo5I_^gzO04;2N#HLUGy{NkdLGu5vR1AD4weREN(2K6qkfY^bd zA4Jg!8ftn2tFceo19X#2UR4{kO;0=REh4;$=$Z6@8R0e0p!y2Z5yNDhNf|+Dq8dQ` zl-`F!t8Bx;frk>)L&TPvR%WAp4BgDt2Nnf8j>BATsb{p2gFcYmF{~2`w+IK<&l9Lj zsq#L`N8()@h9G*_%qQ7+&f4p#31k{i*ANhH(HW<(Wfl0~vekm#g9;?_o7ttOwt%+P z!Wqsa7z{cX!66W2g$a|ksdZ1CIfI&d2+Rm%rfTA$zGB@M3vo}oK(Rx#l>WsWXYXaa z)AF`K8g0c=b2tqQvIEqgk^)3Iz@g_sNr8iacwLn!h0yK>9&hT3Js9=v9q1Qct5&7n zPYAAn^^DpUL;kXM!Xzfpq#-C?r~^+Q>BPdDfx=lguJ^4LEIoRiZo%|RyD#a@xu}R3 zOJ%(gmosHCUZ7R!G81K=9diZc)aW?JM6b*0&Gp)r(wmL!T?Ys3WK2LL9A(T_JzdWb z7I|QnTugUeron2I&5L$~gq&|uPF?mVp~*1A)y`cxb5nMW806xdP)C~z1x}2V-PpQj zuUyBeAhZ@xZh%LJ1GsAJ_7{wukH@gKu=5c0!7aVEGz_?=>_1>BDMcBV{4tAeQu`{C zq}enDX;UA159*UU2ZD)>0Do5%lu+QKtUI!3o||1;IPQgvwAGiTdyaIQypVuk{WNSOQ6us z19+NNYf6Wil9XOshvSo_n=VMdG1GYSgKocBIpygO2wW)A3-qZDO=@UY;QEPw341@z zq{LEDh5?U`CXXnReJmd_`&jg8u`8b1CF^n6%kQqZfX0%T*PC+6UY5{WptE5$tnV@w z_NIKU+Q9kWu?lFc(-Jfcx`jTXVJak9*4z9*V=-3ur>nqtdc;4E@n;GV>u0$5;P4~- z%_3qwiguvA3l(r;8mF<+#*+1&~oyatNIdAxy#ngmw*CXygNJYZqBGSkQBc*Bi=J)wtj%j zq-=x~LfZob44IIi(!N7imt&}`@v|h-np+GI;sXLD&Plybz_pBfBosvn+=J*IBxoR6|hK1%m)CQ;%+VrmvenlWo}8z z7!thV7~kBt=T{d&6P?7NzkxVp-RXakrk0jT@F(ktH|L-J-=}~3&$#mZ`=q*|Hu6zn7at({@{bV zw~Aoyxv_f-h~)xeIf2|Rg1HNb+ zRROV#-Y6iJ3y5U}X$>c>GM@<1TtF;qs6UfpUO+4ZH@kpXc7aD+kmKE)?n_J3uP}Mmix7X z(FtaJ6Z_vX0RIOKw^J|e&KK@vXn(`{FG><=2xR2a8KH(>T~>QT?>fn9KbhWN07;R7 zhWtTv-o*fYyB74Wnra-@ 
z&Q}sh>LQ+brC_n_rm57SV$V83QP+8n@`UtW@yZ=DR%*68+3HUgE{tF{2*Ui>jm^)3ph$0{(p&j4LEh96UP_mHhW$*#OgxU9T zozXYv;83p_^y-+^ZCMc@#otvCSF9UjchE6)Gq1P3Q`IAiu9`0+fs^7swCb9@v2m?c z1_>>Hy<(AC4;;+)cmQs8&5kE&%;FS?UyBw}zU>IH5oifaDzbpZKdZ994|g~QDWyRt z7j9=q)B0)#Z6KM8GgxxJIEISxx~fY+KO!MaC@-_T)A8_wOGaLg#L4rCpq7Pda1p}- zj?)GbEe{u;nb$aC843#-8f~57HbM)tzo@d5PYE)T6+s_xdgu~iAy%!?O~z9pIM67K z&#-jmqMl|WQQ!GuA;EK=R{<61UsuxW{bF}-i2UFKjO`#a8%@P!rM4FMORNx%!Kvr ziuEuc=Pz5Os7Pe|_@eBP8TQ_BI0gy#J>g$eZ{@IGzkVHDl_;p3Oued-gr_Udsw8*A zU|H6E%sBTw$L{P!q-+4eMVXf^tE|Wxmv*dsB1>i0Yjhnu8Y3sWzOk~sxy&_p@>0vS z!}=Xav?HaX7zIr6+Qlfcxz{cxH4R_8EU>FL91xug18j)m9{CX%y&w9aW=Oal|42n= zSu#7Q?UE5j7Q516di-SN5T&H)2eV@-GeRWmRmU=BcGJU*5$Xi~2>pTv+)|&)qxwp9 zWf=lC8rUGrN{^@YvUHo1yz7%5f&u`r001li01E)Xf|k3Wd|K4lOCRG$ zn7G*7^xXDLLp;4hzGCf#;dscci;#^cmNa%EisP_vhhnLN7GCxZPO=$x++IHlg>L?e z>>1SNwc~<8J_32xz_+3G`_2${7ItyMJt&snbh}bD_dq|7q_Tk92+bv|?LidR2qQoq zH9BrPWA>-60`dpANccTA>SQls_G>i!pY$CvR+C`f7NUDYge|tIW|&VG0%9;8a)fX_ zz!!ig-R`OCFggN2uflK&4hiVI1KT=sPPu1NyA1Af_#X6IT(W6`qU}ZpJD+fW<#eiv z8;08<=`3`nJ3@w?-j|W+*!U5SU0c0G?>mp}feZd5_#sr48gQ93KzF7~C;E|)o|Km+ zwxkI$^1MY{Z;Eie?}2<>(=qE-vcZ%i?WEIvtEklmp|89W5C{4Ld=FwGtxRoY@mst@ zH4+S1nxxeNN~CCF3%y$juHmYQODbY?Kd68l4EZLimE?X!`xiFCnGHyQ|0O zy1@B(jENRDdV$?ho)Ve%i1JaTDYs1aHreN(*QLhdN8vu-n%TtEi?we|l^fS0+2cjn z77i?LjNM>B?@Kb6EL?uol|mDtZ^}V>#LXFJF%3 zqF&WZcc`M34?)eN?&9L2`t*x*tv^$3&|@qL&$50oS*Ony6Sc84Yk_5 z=yg;2(~$q5TCuKI@dKRuDWzqVq1!rOf^&gQrCm|*-8s85)=IG{-lgkJ=xyp)acPQL z7-7nA^-$vxI0NU{<{FnQcOyIMSMffj#Py=07A0{>YMn9Ja^)}+A6p9@YZtrIb9bjN z>khekftjPH554*2))Gg!0K)d$M>{*#CAEWNa|B#RQ_lLI%1rzao?8D9CJs!~P@m>l z4AiEk+k#4a?88e)J0);w9_xEVmGtdnIuOUqa*=~@=m12UVCuGD#&un+s>r;bJS3p{ zR7V7sJ{_PzMq$hPnAIwY$6c=tgO?#JxJRmni>sTo>ekn)K;?}16BwQ794D>Qz&&zc zy`&$d?LM?W(eeyIZ{es4DiBWn9{lpI$nBh@x-VdvKy@$4%W?jZIdn^W`&jK^Z(6KYrQ&%$a5Ak`e#H2< z8yMaOuUTtqwKoAmXg$qs^cI$hY%ua#?b;2oS*?QuwGznMLGM0K+i1(xpB|KyEB~Oh zd{9#Ec=@PY>hdw=7C$IG%NE8K*4K5Nq~Z0=4f_6ho`n^zUaiSXlRj>&at!v+3mB}x z4PjBKV-c$y4t*L{oh3983s`#`9HS3fPITfpdaT$x>{*HnEoI87<#R*A@mEJ8S65cB zk8wsQK?yssxhV8F$1y|T4uQM@wNaBzk=w4h!`@UyS&4?KM0G?^f~nPfAMdnVcs<^X z$Q&LULMlW?Vr%uu+c&Clc1)`V=ggQV6Rt)*fqLsj9{U|7@S3QdWLhb{E( z?`IwDbA<0w&>YNaY^CLu&L>0;c4QL~{~F)dx!?DiQ<P(8$&lkk9at8#xJCd8&u3%4M=f3*iwRGr*vUK7A5J(cu>fG0VDGBSl zAYv@{O{vX}X=^U5D?ges8!KYJJDuD&=sQNX%n^>9G*NL;G3UPeHj- zIalr*xu#FznhP6M#3asf>a&ykHmzID={4qpgQeUzYIUxoujazls`W_xPEPKdbVY4- zjiX6#NaL-(oU!J<5r6b(Px~C8C{q5l+*kbV`8v50-E&mVaNbuJ7I=)YLr-q*+qNBNx5xpIm~<)wz?gb;o+h~59Y7DO z2$bA6?HzGhBdLNeY^S5fEe#}Kd(ZFY3YIsOosqZrV4}oWVyvj4r1~!;Z zz+gxxfZKcW5`F9PW-qEg-N*ZHvKn4=7fyA<$cU?a2P;>)B7BlO9Y03qS_chdqj-u1 z2fW+DN2mj~4hPSRyZg8AKWyB&d++vjL@7auOWS!IH(3onMP+Cdjbv}RQMCl;a$ccI zUZtzkP$ujHBAMt&z7duOS}LJ6>Ku672cSC;9Epg@P8Z>0qYi?>u+fGfl`}nykg6+4 zbUH!mtO^ZVd=Lk=1-LAI+V%&a3SyL}EWs>y2MnW~ZZ+)NBc_I30lBH%6Bnbuo0DQb zV50s%EkQxZ3 zsNAahO?L;&u#nb;14R|?NRi!H#3Skk6w*h}mY0@35A>K<5eqW|7o8yRg9Gnr7oIdWz zO7xJ7zPAv(^@wY0@g&;t~fp!I*VffEPq(^Y$yt3 zuD<^1qgpCcb$h+VFw=F#RFTC>B;s45iGRK5c-v5Tnhy2sO zlF)GSyJq10Kk)Iv;s43M$?q&u3RE{GKk)79+Dp`J_W^W4UkYk5ofCJ_Zd#gDHx5KVv%)TEOiRonoLfB;lvn_c41z ze{h_C`P8XT$ha9}H$q!ohi*Dv$Z$GJY3@hpCt}7X;sz-*GA3>oBa1R}vDrIg=&ZZl z+&6YRX)M|)QU~dSDQ~AxY+`#bJ*K&D+NMzy(nK?a@wj4U?E%bzVc6L2f;oZW+P1yi zx9f%k+t3Nv^uwl(Md4~M=^)JQ;cwDTEut+4<`L{7fPM@|OdZR88^6i+a8C*kM%ALL zBllJQAr8=Bf9mj31P?WvO_hANpts?Es6XYtEoIoFiOXPNfx?8WHLSgLYq18Ts~Q5j zum?=LU-M}({Xs2g)jmGnIzH*09&Vl;9dqer>#Q+Fv_dU3I-K^EG_03fI!&!h)Os1@Zfq5ArEy)43tVz8uT`(#`?{ z<>@iY{UCfVt1C*%7bOvVZ$4U;CA>@xL$Br|?d?eY1Sosc}u**$W6Fc0?h* zN=v`c2JV;!(r-vr2>6Kf=6i98rW*Fy^gW2@Oblu4?DD@_-M!e>&sfq(SC2Ob*wp~Pn@9Q9i7Ab=!}2%vtP^G 
z!Sx9sXT&r45QoPDY*CR1N1>DiQeenPezVh=I=C**jEamk9BRM%sYr2Vbl! z!~SnE+fdH4<~TF@`qQJ_FPjz5KS5w!&jDw$HU!lfjA3j0s2 z>OUkm@w?Q2>Xi0X43U>Ot9^5(QfcAKVk+e|uB|jF%!o`#$E}G(GjMGlk^Vd)s%BNl zL`=;SvpCrvY_|M?nt9eb4rIpOQ3wpE8@J!R`+#v~v+u|8E`ZqyKUYFX;vAuph~6Z3 z!XzKphz9k6r{gSltTTF%peEYL=1Je$%Gw%{6!!wa8z7GSF=B4qV>?{7_Poxi^%LJa zK?w7ayKLQZJZzCaBAAE@{rj=s_h7NV=Pz512T&7-aGa9V+Ope6y?{tXaze_l6$EW- zOa+u}2N6!X1+a)=q~Hty^aXj$sPdyd^3LvkP7xJOK=IR~S_ zYLTrRloha+$`V-zniatB)OWeXyD6Wm-11^QD0aTZ&bNq`d@ybu<|JAYrbR9`Ugw0H zK7V_bcX8VI$$dM%zV?;%Uf<2H_nOuKY!OS_9EH2OShShRU}E^qR<>|g7by-oS+p;M zyZTQjyq(5f%^jX-TJFv+vFEq5y|J>g*(vejL+E1i{Xr@PkK*zDLX z<@r7Qf41R)RleTC{}-R1e{-r63a{=3kt8?(|MmQObuTg@F1;IfN?;40?WE84%i_+` zF6oPWi-k+Na7m{@AGFv^I(c*Ik}iC?o^{%o`|98RT+Tyt-^{NsePw;?-zw)V+zD|T484dX@}p;k zB*N-wmRPmuk|8ms7hq!WV1xsGL}7#{IMc_X8!B3@yNV(Qhx>p|`~+ZMQy^$JL02?z zcy|^84eUJ#-$2-fc?9TXC43Gef7C#D0r-tT0D>=%pIGo!*9V@n-z~VGe9ctglq@peSdxR3y+EFUfb-nQ{f`; zMhh{piQ|AT5g|94G&aTB#OokU46AKmRy6dtCh3rdLhU9vLqZtVWr{j^l|X=d;Atje zA}I_c`45GV?93jV*r37{PDkXnE4YYLtJw^}uGDs#U_5I!`I-h(ltO+d+5~xVE`q9KcbgA1JAC|XplfgO!$y@n=mAk+b4aV)L29hnr+}VR#Tu4=DB?v#q zvJrY$$9zk~x2&#b-dm3!)?}3YRmo(1@etZ>F?BtW&-x-i*M_GuHL8<0iqO^6px<+WGFt{;uq0+*P%MwRg))nin$N7!`qlw;z7_gm| z1&Id1i-(}S1`vefL6?5iEGg@XEG08|>S3mdgLCec2FVD37O#scWTY^oj!YxeaP3$e z1#>MKsvVh(nV~W_$uObafo&+dt&D!u^BpDVAuyL#puWPp*07KSb6NEL>BemnT>KeFvYmQd~ zTjUYQrm#+Tp+68#3JyRangfYf)b~5#74wv0meAj>HyhBR(8>%0ggRE0zNJn&n05DPz56?P_`7cIZ$L$Ja?98C4^XJ9?<=R?n0_9dldZv zT~FK@emwBTsxK!hg^MWN2R;n}d9aG0P;wP^piE=1_4_2pvk^t^@%43!4+tz|5>?T> zHZX`Z_%nOV4Xh@R%OEJQ8f9R^MB?WSBV9bh@7~_%w%1c;4`@miIisO}NRctt&-w4b zyF*^X{V_wZ)p)Nd8C9f==R8`cKuS_|Zwx?RB^Hqb<6fcowg5wFH3eHgW;(QkPu%0G zAM|Q=E3B=pu54CUwl}w~Vur#VN~TlA=`wg6YFp${(Jn&1+kMZ)E&!v;*$VKn6M2{U zmu80G$r(j2gM|hPlZ0*BC&Qey5;MTid=H381l@&GYS1p#5efbV{NcP@=2ZSjE9-nB=Niz}|11^T4vJS_?5ld6NoAMt01V=*gZe|{x#UzfZ z%M#fO+z#kb*Z7rd7sP>=DlSw)rELfNamOd`t?%tL!P*(wL8ay6haasv!yh%RCu}Sc zvV?k6V)9K{7P#Ey=X77N$DvL_5Eh4+0^|X;I%A@FrUp#ah0pIHAYPl3hu*_G&_O~Ws~(BFR8z2$gbT!OzD|bzI7*6=)(feR&he9Nro{F^>zHzI<0- zIqzwXrl9L5bOEBswZI9%K;_yErGl}Ap*V-T=H`Z2t4a9>p2z^E0tfG%3A$D`y(yzQ z&r}y!b65!-djx3l3LWg=hLsgutst?h{R}-oVa3Xaod+!&o7mylh*X5WxkHsZ@-ZUx z;$vY#ls{w?eVGt`Xqs$DBCuo~fs%!eV3r9q3sg4<4Uv5DDIa4Lp!x~;Ux72k`zYjb z4M8Sr=0(E&>S4ttZwQedBkX`sO)um!8LxN%J3x}W5W}|7d|8@_P9atW>dJMU?~a>g z8iK;GyQJp?wU;}f-XLUHZDZ1L6fJ5v{aR@USB@IL`FGzU7hw9bCdJ^ffC^B7(vU2p@i z1ZtV!;L3!RQ7t1)#|LjO1r+_Bx<{6N8ZRkPSc3F$)^Qo41_Ej+2MY|#T0=rzqWln= zQYJTS!f{KSQRH=pR>7{1&BXYjiw`_624hdeO;xT(er2qVkiI7rSEUc}TH04bT0rW) zAq2&X#zRz$9PUn2J;^O{Yp>WvPvr|SE`m`(^Sw~Ze8vu%)2Yg%>L|BFcPzklm} zeb+4-jEZtyeoApk>pN&Z#YJ_)&Pd9^)jMNgM_P}u^`XJdErazCVW8kL8Cc~zk8eH1 zCKb2+{KEhAi}utG1T8W(CJ{UwCThVC#e8Y9;PfDg-j_KeTO)`JR9=nstoEbf;5z#Q zwU}xJqc!HWG)vmj+`u!pv{M!&5j&HX=E~*|I1efRB9+Rnj}@^Q8KY5XjTq|+3PnMU zqD%D(4izU8P1`%1Z;H7by{#51dWvS5K9E|@HF3E4RJgLmk$J;+2F}1arokp^Yvg0& zgF68GhW9CMa%}~wbx#aDxbkEd6E`L`w=tj4Vt553EYI*dU^Ams6#7EVS3QK%0WuuQ zAXhC*nd8#llqQ*B>P@V!DS1tO5iyoZsqhueGmjY;UZt zc3YjR-IZ>46#-O5!nkqWPo;{!!R9WTK(Wz3xDYM7eqGG*t-2i!h#w?*8rp%m2 zkAz%T<(Z@|*LIhzSTvgGS>t+fhM|Me`$%hWyvGyclP^pPIEL_ylsp&D5 zm04Ag=9NlJ1o^yh!Ix!WM#EUQ4Kv7yxV5AM-FccSy7gh3g__PzU}v)e;B7g{2AuMlUr- zi;doCCUX{fy~!8lIjHQRuL`vV79~_7R!%70 z?F#7|o2n~y?S@R~>Zv^(z+De7J}Evi+GzaZqLMS^$H8e4<_OkwHS3#;S~aM5aRtPF z3H<=auM6h~R{Zn^7LwpXowWz(CYijdHfWokcG_FSyk*fd=>apsYo0;%kL>}>4GfcU zCc$iKHz2zW51rjaHAuI`-bY2sHXIyyC^0=mY^iBwHrmIiGFKm16zn+8gmOzgqm3N& zf$WZ9oy5Yy_45QOQ>wg=^1&+9h9QWTn)xL9b=F=_O(4^Fx`yBO976cy^Z@&bv`wvhDx;_}>$M&NGXj~Z zO8HPz^@??0EX2K_H>Mqi9ipZ5FXlLVFXNq-x5=(+4NTqY2Dt{`nn!_Gl;lqjN*_09 
z?H}M|@t}09KO;+=CCUS|yLYUu)D>aL+kt-JwQ5!B{REa7#Mhvzy{w&Zb;3@VGz7)x za^UGBomhC)5yqtJeX9jaj~U#zVEU!qm-OabR7A{Fz21n+nX(u!(5kUc&5n5p2?aEA z$2rC((yCO{{MIKqwJoI)8ri!J4%o>UFK1BAn5}xco*^vqz%03#?z&8a)hL@60Y3`{ zGT)?}y6jKvMZn92-7;4x<;+d(C&VDfUJhCp3H3fU51f^#hdy|y$Ag`w;}U@0j@8JGMqi*8c;DwCwyGzDo>p9?L# za@avQ$5@a>b7r?w-<#F3$%6Sy8d~86RVH5KH->|e)YnRnzQ|H~_NFMw7msA(E-o*s z8NB#x`ccl~`s(KN4>FzW>uZ@`j0jBE!}Mr_!y-S=>aj#`y`Cseb&Qe8SU;t^WXF2C zsPq}Q!QxJ_;$BH9F{Rd&4%3=W>$P<_K3Tfyg7h0RjW<8&_M4Scp8f!wlgf?J_kA!` zo;c914uDU%++pv>nUq*6$}kwYQI(IFeXRPS*cE3t%6c63^1CZ8 zLc)uX@Gn(J_%lH25d`tMh4jO}$A@@#W5~+s1`;JN#Ray(Vv& zt9q>R7g@*W|DrUc*@MGK*gt*5LDJJ}N2N3+M?P!*1LY4;ej5|AY1widyl=K~@1!D! z-#{~dexZ!6;Zi6RuPq(J$E$U@_+tVMCBGyxqCW(FbN4SqH-n7sEEofqH>FTZ$ORnlmMCHC5$o}Hpv*PaNifTg) z`)dOUK0{{f2gppyMo1yFJwU>b2?;9ggHjV|<&rsu$QJx8IUDAdPonw*qTWUNl zBa=S_2o_SPnXZT+!evW-T(-)k&;~vbGCq(E2X>nfKRwo0nCCnU%gp79oZe}D-yc0R zzdhC$%hBn`sh5bIIKra@AaZa!z$@y2Y+6}a#WnthZ`)|r(EdcF;)V5vBfb3A z#wru)*D71<*FZ9~vbkEZH|*8QioM!d-MrfAtY2GyarhSQJ^a^yia+1Rr_Wv-euoeL z(OH9)B9P}>)Z=_ohwlFuRwhLt&+k&hsnGsciHv@U0RJ}^yZs;ci`eZVc6&ZlXJe?n zF3WroyIsU?7l_ILwBk`MLv@ST?E+DmLD?EAp@`is5S4)&q7hpZn0|q%Tp%iY(Or0> zjDIRcNmn2$Yv?t?ycCGaHo)ZD03|g@w+7OwKvbr&1?Xmt!kmKB5}>98OjrCq$}y_} z>|-Qk<$(MeO#RvD+^fxt>E3`!hlfzq%y$=In75w$phWTLo?u31>(b#Jq+W zkB<q(cm-~hVbCgYqYB(8p(NNC#DEftLUh&Eu6I5z)JK5?_7A{M;4Zga-b)r*p z$0b1~+4vHy@mVf6L*hmrJ=7L=BY_Z1kJ)->=+qV8nDDxzs@yVa>p?j6G9+CfzU=$Z znG8Vg2L3QCpOhJ&?vf`%afOBgP*G}H0$w?VDM z*RvDP<$YQfX2C`D3n)z$Q`R%{8YdwgL!*Ug`*fO!3oMIShmtBw`IOMUtO)u*&jS~r zkPfTX=qBT-5R_+>Mp8ay+a)!UzE(zPePX=JQ_h(b16aawCF2R3KSM`l-LwZx%r*3P z$&fZE1lDdmxw1l7O>X3UWNi~BNihZS2Jkp~9T29oHCeiDMcAC8h9liu%!Mm$wJb7T zDhf9M+4?8DTI?F}eL9gC?Pl7NQU$yD22(`sJPyQ zrg3nJ*H2H(63 z72F=&j(?=0vn-h%)Gx^hBWYdfFgfA-v$di)E8LbW#0gHE@R0l z`%T#8ugIQ3E%|YjE4`U~b_hEQyEvQ^plPJ$oo-jE<{sGQ!w_bs>|)aPAhRdK2na{u zcW-CR{?t_f5W$~--(#ar_9AA#M#KL}-yuOY`Q&ZkwKu%iVw;-u#8e?oMROiv?+;%9 zo>b9g=rB40z@|c0aSjP`*8bE=Sx42}}1jW*g4razq8?czw#9hJd zkaU&`4h}87FC(w9@gp3&wt9)CT}QB+3mzo+AykzbP&N(3OX|`IDJ7fl^3ueXlyvn9 zac$lrt~W)v-uJ+@uGyE_o8)eTuuzW7C~GOfU685JS6&H-1N{NMH_(oh)K(V1#XD3Z z!Hu)_t}`Q>(jq?X1WFo9=I6#Zat9bLJQaaaM6}4)STg?P6>?%9*7dSUM&|0eZo4H2r?%mypup-PL1sUEq8? zb^#XF=mmC1c}is3Bg#jWrra{w+hpI0UY8n+ABFpT@MFzrV(P`(H>S#sYmw~nB5Vr> zmN&+3FrfD(8B7)~zv@b%iO@IYr=u$Fb_NhcdLBmH>8ECT@_ZSnXi`RCyF%{$%H>!t z>Q&8jhbmh65Y#;CE-o&rPrpdl`ZLuAJ;tK&EbAAOGV$4BBDWK_mRDD>-$aJq+T&)wS+tKVToGWNMagM^{u2#^$w+wau07?X{KZnr^IaZ>+CuB(F}_4Yk_5=yg;2 z)128ttYckYWHdNBH%cdSK%!%>-$hP%@yk=CfJzGEej58=;ZU{UT%#;RK6C>epEMv- zC+kYj=t51Xv$gtQ<2vXVLtkY zFmYg-(lLbdCff>Z?}(ai3ro;rA6~*FZ&}|X60GuI{4vB{R#VIeMw`H~)Pfn;b+M{) zc1#=+aLTA70!yC`&>*92lJzmGRT7W8UK<84%og4wRl~*AOTs4y>2-qcm`Vn+dGXp&y(=&|5UyajVaqNi@oVZRm`QA;`Dv?uvsI7VQvLCf^N9 z#KviuHa5}Bz!}()d(4-?QuhONSA3=xjG<;&p``CXKAt*tiJK)s@D_aBurWLUe-5Zv z@LvM-J+CPEy?$O8YFy%O^K*iEsaxm!D%D9H^C9>;!uEdD=!> zuKx6(q+Iz2rR9T?a>vU@b!lF{gB33yZ`ZTOMODHZ3u}KS#oe-iLoj8sjD-Ifamf}K7nR06R z+>mhm)se{6l@;t`oDoV;!XEN*ImaWK-m}YwoZ&RZ&)=p(;@wc|BG; zb$Z3!&_3P?#`j~sxd${@80e4+k&)P1ee(8=YMfn@iSy!0v(p~HyxnR3dA#6N#m>Q| zs5->1(QRYb(d+T+6SVurJ$!M3rW#I2oOziQxh^$0K~Q>~_|^MtZfT#gFm8 z;4G@sGn}J#TBZrjMNCMdP?%wm9aSS5qw&y04K??Sx50aRh%#l|TdkK`1-+3I5V7(w0c_M>PN?$l~?n;kC^qg)^?+Yr;zbs|0c z10K~6M>yaVplQLNhUlL{>kJ{qaKZGPM$5tS>-s@`_p$t-fv#`p%T;=#yv~kQQXfj! 
z6(sa@@xXzOajf|((W{>l8TuBMa8HgC%kq?m^TK-RXac?#cXyny9k?T!5cSey#||Jr z+&sFTY5=`nn8vn6CTObFDjrI9lJ#GdJ{~*4X$c7#c@i|G{75O>?2?vOt9&n>_d;zV;Gzn*;6l>-&E05sBZ9f!rx}`)2Ei}JtxmSl?~9wM2wdakOOfWS&~ zR+#%i*?pPQ%;v&I)#CAwJ~_*M>$Y&*n;79Voki&E*1*AtCF`_V1>?iH+_&s)IWWEO z_%QeF_#Uh1EA(*Eh#kcIO*;<12g5frdhW zfNizIW3ZXRqs>XKj$sg%;70G@CaFA87{DEGa4vcX&V4MZ@jX34xQG z_v|vuHgY|l%v2d0-hdCn2^e$i1UOxqyu>3cd9xSQ=V9%-A}pdJ@?(W~d|FCiA?!DXB5e%qJJ#qF=R)bGbzsuY+Ns?|6Aeg0nv3%tiV6PX}wD4<*{BDhjzdRRw0|6ZncjB%S)L8(#L@aYrvT{ zSJzn&^UjHf5t}+heq{zsksu?W60XP7UaJZw~9TAMoVVkMVq>Z?kMW|B(oUP1 z)%=gPZktuNjm`I+d#^tbh!@GsKx8_zii`mKai8a&$9KMiSO^d|5fCxx*_qEUy8+L6 z8&pZJGdX4-l$Wb zqRWH-SA4(6zp_De{;un*{%`p4;okqwztdPbpDj-w52CXf>a%1XENCTeV*u zY-DuB>FnI^9US(K_STN}I?M6O@Gv^sj2EW9(LV|Ys0P2#^J_b`cra|WmKGP*msV?? z(bK!}-bYj4{t8U$E)lbVUxltwLgExw6WKJJ7dX?a06apg1xgU9T&?S{K{_K5fxs6f z2{uG36G?46byRU=o3tEyhB}T1#krGpXiet^`JO0eLmPNUY5CMIu0Ml-Y6l_w^fA~$ z3}6@XNe%n}Fj_i*h(dF_U06_nqMF2SIOcM&rsU&2(uj1& zkBAbcc%J&{LOPV$8M67I&eK7M&t%Ps9FWNb=1HuKsqfKyYM+ZtlP;yMw-|ZQ|Na%5~hA7Z3qh&r7it!1kCF?CdLp-7{MUB zz``^VJqM`GsqfiVv)P=P;53?|na;ywZU)T{Kq!IzP<11AqMU4y zxvd$dUMWBE)bH%2oFI~xpKxZUU7O(rQ{SgMIexx#obA%(aew zF!giviLARMYLTA>9k`Iymdxc@p^K^S?(H-{lkaLKzB1#0b5jWqwH`}5MncjWn)*I& zr$$1xG&e(aXhyrD;NupHL8zQ2$bO9%xdvTCUbcgrLV;E3*eK$m+PF?Fx^yVS5R}f# zq)vSom*LtO^HnPhS=L!S=5f5*PW|%SNWr;LjDCMCjB$Z5fiwDx|NA$;7MO-R$dZLW z75?Nrq5(PY@F&A8Cai%l2!Ij3y2?JU6Sphk_*ANsQ#Au5$ec4p9A7~JR2=`3Fcu!k z7YyiSVK!&SXsp+dfQTBE#uwhidrx%BPH+kD__zgkp6@wxJf7~^y^P;9d&<(Kbe>mo zqR-EG)Y$!rKDF2LSn+XCfp+}~E+K^@I>#xr>nC;>eJkWOA1pg6{vhN>b7 znTTkjb#aCv$QEP`5M|4Upuso;)x;adEv@kfov>u`c7l3q@(^t(j6dWoQz0uIa}CN! z$X#Dl@$Ad;x_G+qpy}#)#^zCR*9$7Ufa1NGD)}0U8V>R(4u%aCUIpg`et9gR)+$&o z8bKE{FQE_2ROYvYO@qira1jyt_gUnGd0hdVIbl{!c;q1-ZG-ZMG!aK2lK{CBW=n;w zCHKOLT~K1-HrF9nMh<6LfUB+vkICyOY2PGp)g;`~xz@;+jj8&}EFw(sUjHXiQT(fa zWAE?z&)=l>5Pz5NzH;o>8SQ*^XTq^+Jjf_6G=%u^+*WmX#GeGc>&5gwXTiJ2s;__d zI(@x>fp~ea+gTlcU$m#06KC=S+}O41*Y9nd#E@O94*Y20Og!1O>dOZojRkDxBw{11 z>k4N+`wN`qvpB82(E#*nU{^!vpC|duwd!cK(|xA8@?0VJ2@4FsXOo;HGmpvJXF(8HW-)u}bH1B^nMXp)eJ4f~*nY#O6l)?v0( zd+c}Xs62bE`glk7*4(q%`=&Ru&%rNg`{g-gmXO2%;9=)QS*BJLY?@tWOq7y+tvZY% zg{G$`OoExA7grTwCt|C;$o<@@gJt|$^>1;0ZEj9FLM;SxCBIgU1Oh>x`Q}=6?*qN( zatn4I1i|6H6V_(~$RJK)?8bs#Ac(9+u~>Qy=<&pS5oO^u{~yyn5GOTU%LMTiB@Bs1#6H*;v`E zY;3GJmBo$KrOv9;U9s138s5OQd;b-=FnjOfr3eV0E?E-jzmK;pfD3^`?2MycV-b*>gL*RHnI#ghN$ z^rF)(`PYemUc$V+x(My&Zx<2T*KzVPq45sKgh{Nv3(~{f$#~g! 
z#j&tE$7+L?K<{4HT@N5&Ef;8jQca!~Wrg6_Ro;G92<(r#$uXbkx z8L-rWwL9j)O6|GiLhThYuhVVPj-lL(^jE33MKxc!TMB?)7NA#PK?*EL!pe1>iFT$E z3z93Kljr^^_;tpCIA03r<7FU($Z8v1?e`}*+&_L{W&7vqKzYh~(tdQ0uc*Ucc9`u>6DeX?)h zV$3VZahUopeteR7#bjZe!fSGhhi~e8`ZFNq6v5+Zye6lyhMe#ihB23-F$t#VRuO9bC!r zp2!(y=#7N&-~@1;dfC+XYD+ke0Jv1ai>!}ULuw}=p?Pg*`l+~94E}JU6Ipc+9#a+V zG1#A}pRqqvGnF{G^)%&>&$2oVZWa+Lm6fKK>D2eIewH6zb@vPm@zi(kUW#&7Q+fF~ z0cUdR`?+bI9?X!X$gd~HGp4?4t46X+h_|HCp;-u*ELM2xyLF4}F%1Zv8V>^Q*<6{$ z2~U0hHZu*Mgl0CKN~2b$pVQy9stT>0!?BbJ#2W^qsqfmyZ!7J5>U)~;=j92A#yFO& z^BIh$zOTPM?sdM}7GE8L_XpoT1;M*m*|=S~@1vUKs7;2VXHSkI5Gv$p2_pFn-zNM; zEO%ss-rKhNjyLR%JY*gjnxhLb8xR(Y5^8ay6}T)@*cb+=`;9|4?l%TbY?FHl%Pq%o@E8RJLliTZ zxE^fFbAL5*Sv=9Y<9Ka%go03^bj%X708@HO zJri#_tN6@3%9N=FUf{hgRShjS4o~|Ih5$7MNHWABnJ^;hC}dl64^>`4(%*5q_Q;Ft zlArVvC77NczdSDYf4%?h{&&Vp;Pij%VgGx%;y3;8_kYm;p(%SK(Rkci!hbiC9EpBO z7pf^g?9Zj)U)=gb^+kH+_w<$D&Ey+ReCY~an$7ITOaBW0-RS>yhGqf3{MA?W>Q_Jd zq7)wuozj+7>W=(2ncqtDR*5TupBFzMaOR^gCzN-tktWy5lbdHsKQ_3rY@@buD~d;IEO>VY_Z^=*#B z@hIKI|Iq)({y*t`_t*V2S88n0XY!*U3f^SZ|MF52=%Vih7v;bE&NoF*npWuX1=NCU z_EToLvfN~{UUBYa&e>V6*tY=)y+G>{9l5f&u)q?Zjc4Uq5DyGdiOFdwKQLPNFz_8{ z%H`R}25+cnz&6TOlEmwY{^XH-P=;;}o{8R3eJ2<`GT%MXAC`Fp)n`dZHpH#uLIYbE z)B>lzP(^*NMW&kJJN&kZ*A46!4JkYw)fcFz48c0WgR3!gP@y?KsxMX-=2Wv{pj!rb zq^JpOsHCi+ci<7=$INY$U&`u>egJj6ep5b|pQ}_vF(-vSTIi#RR*sdEXvf!HAI*Xe zl@dIL_B>q?ev{LWs_=wY&z@l>(~zb-`ltWnPNsJDm(#p2?Qs5|et*gi=O;)XY4^qc zI&@l!xYdDQ118~NUlpfmk5D)aj*UAXeo8a(6BxIciz>E(oD*qD>}x?_LL79czKyXdn>AOzXO4?z4+U??~ z4XvLAj%WRAd+6Cd@8Crrc|ptetnF?W_eaTYk`+ndBy`$vbpE*95cF~H5&LNzTcE?=%z`Faf3skDUX@ zM}9ZChrRRxk3x)WJL&Aa^(gRM{QilHat@w_ zcgJxuKPNBN!@TV|FRUl+KKwVIo2GFUL5<7I5*N+SAF;#R(CUt)K>1EEu(mxH^+40+ z-m}{dTul4cLl*!H8><_u&x_Fg^cuaQZjU0@NAX=tK?1h>1B~dlSGjBVJ@}OHHonx1 z^GA-qeIL}kO5Smu0zLc)53onL7K<1Cbt1Pz$;ejTw;qF;ON43mG55CbKQ>R{)`?I6 zkb82Wy0N&qu(`Q}&qDjKp?5gG@kgHbavuJKWc$A19(&_0Kfvr%K2P@eTNVNTof=^| zR{nhCfI0fHeN?%P%`P9wUmpjd`#I8#4U*NDG% zPW+`u=RD3;>=y(b;N5k5kYXMfk=1P-lAk3j8oO_4VPOR|MSSbib{s&y;)5laL&+tf zgM$MLUMjIl^Tegv^-#6d!h^Xo-8cf-wum@Y6cqdY0P7dOv_tHExu9zgTr>lh+ry!{ z%toWy^g;V@w;niNFy#J>0+G4S3&>U1@%b3ogQ7nSF#4eOiX1IES5VdOfcLbVqX477 zbk~YdAre(pVb2}fAw1oK0j|ffg2ILA^E`JSQJGP1fUSj|apDf#5Cs4|=-sx*sX{Lq zw6R(v7qyM)PWPP`D0_SWHcHxp^r7pP#)*)wsu|9$uAYaWj}w{Cz>q=~Y9qTc8SYO; z!y#4y+Jkz^e$1=b&4mSflasP4izC0m*SWtpmzFQG!yQMlYNjXTohKs~wKR8N!E#z; za?57o=d5RZ>GKKacn50=ixnR7Bfld@`Tak~Y3{@o3{xdJ$SduL&#&N&8dQRAg_Bq5 zYGFFQ!#}n|)Fkg%pZS<8Oc-V$Z79S(j|(hZ%&j%y(mU8@7-MT2>&j}&8Q)C~??5X8 zx{L4}=yTpefnnT+`#8NIEEwox7zvr%4tKsU=HT3 zJ{AG`3$X)%mKR}^KIzl|IH}g_JlYH;P_>H7Z(#3Xpg`q0!lJ+mu8zr%wS}dX#mx(e z51e}#lL#)bjs>Du{QU0Ps)gb9C=I2)HEBWl#V}xK8i!~vz&y@ply|FIl_rc?28V}- zU>zOB*p4Dk(yl?SKx()?1|+VGhLyF|O3U*rZF@)&1YyJWpmb{pIfIf26*i$l!BtsU ztSqgJiy4t|kCI8IQ6N*Tx`lN&7~$Z=zo+&MZ?JMCToiz=ER;W;1J2KZ8@1M8az>e_i)8m3B2`0)`?o1yKopu75+iymh)3pQ|M zjE24M2Z!i3&%F|iJI}NGRJI}qzhj9(^7NrRsN)@HK4zfOb70}X^9wVm@Amp0{;w*g zLCo3%9;6h}n5t;VV@}0^%{H%EH*VZe2jAVm$5Sxzi!e6Sn%)WM)*TF!y?oz?&M<;p zx9-6Lg>~iZV-<8ch2veuLd$6HABwgli;u#sBk$_)UC@ct{t42{E2t0+<%%tW>XB~( z_PK5XqN(U}=Io)bgohXe9rhqjQ#XKa<-w0YS6WBwgY`%-2o<~X!0t(99~v&OVIZk!p$`$tq#@J7 zWJtujfU<+R7rF39FMO!Sk0z zkmcG0SuW$Z%b{MzT&)n{JfRGQ7Tl>Ro&e|`<9=i?N}_0zLMKO0SuSj-uo+bcpV6$` zyQs&?6S$qVTLQ1bWl89QL9rA`BeA?|g`}AfKKONXJB3GnIiZ15A{S+Af@wr<#?7#sQ;V5*#XocGDQWhkAjQ*X^p;E+q z2)^Zgp5ucMAF3=1iA3b@GPhgApd`I5~L*nomyJVO{i zI65IlMo7J~;bK*9p_DOiuq^?;BnwI@r^aGL6DW>Q4^Nr}&nHZn7`qxK7IIuwTR*dPjdZ}4 zf#a+rw4T}fxE#;EEw)|n#BCF}KqtS{49bZoh+L6f&rR;wa2Kduo@ue(~M4ICxXeo{-PpBu#)1ph#3sP(J@crUan2 zICFu+)xMx0Jbv{r5dHP{qvLU?{;Pip*G1^IqvMCKz9U}a;~a4R)r@>7md|nj2d}<^ 
zP;YD>=a58uH*TZS$G2lRW{>w)@N)*#FTX30@2lTH+#Yk);=6t`vyYEcu>W2Sx9=_B zFLh6X7T)_w@>#z4!Tb39p?sTbw>+r7B9F>t0I0@(SC#+|^5eZbctZcitAB;L1Jn|s z_;?f@XVL@q?&1QuO0{WkOe*EZ} znw01HT7DAIffjhJ`m&JL9ZV|LC0gi==Hkef@w-;NTg*~nbHAt+2+Oj{*#aF$(pOE1 zeXY8;G*)sjQ@Q(;GJW&cs(bpM2bS@tdIeO588*bf5wij{oBO-+Hs^ee^6`Nb9?x9(B5AV$O`oXZ>YQ&r6?2mzVIbDKjrEh4Rb(Li;*@ z%jBJsDW3NqX-)hm{`H^nmo~6uBc%zUXBa1$Bu!;HzFq6&m&$(3#=XRT6sz_OQ#WHG zzD^5~Qri2iBJ_j23ZM~yJ+A^B(FnQ?@)p1;llhH4{KS-x>f-3R18H`3we!w)|K>dI z5_c&=KMaAZ%uvAjCEqpABg6TH>q{R}_1U*n4Y3IQeDbJq@BV`iZOu9TGk3FF7tGGwf*+IMM( zzlXoZqr`h>Mp=LG=2r5T9Y{UHf`9dGlDMWA)=b1YW5$m+@a_5;be#IER=z6`10)#$ zd!~U+q%Q>Yj7P2p*iC&eYQ~W-xeiEwP5k+ix+aR1N(p>Xv0q$+X{iE|0fb@1bUOiX zC8`C{Tu0K0%q3GlLGMc^w91jrj+zr$X|e*DjyUyw`2)1S?Tvt;p~jAkA0S`t7N`oQ zzU5N+L1yBG;D3)S`vs#%%(SyDDqMsG4Qym>=Sofyd~+^HH#B5P1VBGi`1xoHJ}2TUTk zNRzfBZ7$+vk#%1wxOTKJ^di@W@obiv+8~VpLPe$i!7Z5eo|)9b*<=v%1N1iXUnaG| z1Ua7V0GtbV8kiH}`=I+Y2y%RimJ+#JKXJPR))Y++G7A)b7`YF(Km6$aL)shTT~^ma z7Y5B#RNeJw8Pz%ewFvs)25S+IzN^;^un1dAtE=thU0=6(%fj|2STd6Jgp6}K66dWg za%gWg|1Ib*^xER*5i*UdU;~O9`nRS@f06VOXiz}_9uqgNGOHP@@MzZu?@o+#9WOwf zNgj@eAoUx>8;AB0)5cX!35XHeZuiJ)HY1#vZKv6s$IpCCgSjXXf2Wp)@4?R+H=Du% zf;X6F?)t)@18s=OTuHN~As6ip2#wIN4tO50X%Ou^8gCvmEwbYUt}C8?XpL%H?z<>H z^3ac}^(j;U_5@A#Jf@ye>NcAtw6>_6KbL;kffVxRSp3#UIgv^tu1SK6!!yob~EW#>4=(z zVA8I#>tW&z$h?AWPzA}tz+N_Ysoxa)E!?xAd704R@&ovU$FC`uSU9mpdV|UXKCQBx zt(Q?VBviHJAys*I3BF?1`2j0S9s4a^zGd|}ze{eZCqK=n->NlPFYO^{Udnu)uFraj zKi39FvNfvX7s{fogR&gyHrWT{7K~awS3EZy-6}{)XVF< z2ANfu6Gx^KYPz;8&VuJ2dRc0WSis$%H*4ZLOPzBcea*^`H@uUsJ?<{&jW?)STi)~n7LNOxHJ zDzgdaGm=x0BA}~K2P4p+M2NXT@T-|Y$r}y>o0M#o>aPC;{tuAKI=f&J$8qO_69~L} zXuC+IWyO|~+@Qw?Cj=OyIHDL2$a-Oq!H>H>*i(3Xj5P`$;^~3pL)jiq5sH1Lu|k+y zX52n;NonHkg3V`3<|TJ#>s>og7gf^@Th;gN1p5w1vYfC95rWym&`>@m17o#tzN!Kx zR{8uF=%?PnBX;!AHs>&My2pt8q4zL%Bdkeu&zKy%0-bY1tBFitEEE8dWMZRq#a~qI zDQUH52|F#T_AIUdZ<$ehupt{>5K_AUg%_0PhPbeB*REebtkMq5gneQdFMjr%^ zx6iv5sn6URJ=Dg*`k~7ibZW@#q-7?owN{glNxN-_FWiGFxXeIm8`YK;7uKo^n`;}Z z*rCve((P1nNSX!~!&ug%UXDUGvQi^#8yfgeg9D!NZ#}OpPHJO;?E;AE3 z1ujih-UCAt@7{ti+fu`%@8HhE!f}xgpVj?t%KWM?)a>4BqUj=x>FR@`v&3I{c40a2 zqYB7&1|aTg?GUGs8+Nz!x7ZI)e|IsvZ`nO=b~Zw-8TK&$6ud)z{gv z;D|$-MeqQF{L8@;v9m_PGne)vy?@&mzdcxh4`~v}laiBJ@f8>qH@)d&yh9Byr;h6= z8B&#JolNtIdlXF^?MgMxsS82BnmU6Px1b?V_n$pzV7CR68V&^{{X1sZKr}0LinVnIcf1tQWJ7G%uJ?T_=N7-2(kWjb&$=BvL?M zjIGYI6gw4#xQ3uysI*{+b+Cwdf>X2t9>CfZq+>Wm4p64!_f6Zbo*8pjG3Q{Ba;NC@)gdc z5TwnnoQz2eLAxbO3SAQJ2c;O`#07JgM+)2+77&{0M?5Bz6HjO$6y0o_9&YRN+XW|`!)DSV8eXVpET_CYS1*bK(2Cl7g z;xfvdT7^m)QW+TeW)%az)OX0+IT8W_G&xbiQl8l+4p)qiC^2ZB%tx{y9KU+#?nA0U z*_0!IzttXN&KhL~DDALedp*+T&fz#vEuq-$xDkx*2(4w8p+V}2#sG)G3y|TdPM`Ch z2l}=~?l!zghTt}Me$_ES;VTF|LD*j%5-w=U>BUC+To_CI&`{ml|%nPRzwr0FuK=l;#6;Rpx;PcIXEHo zx^474@Fd$I%v36cZc7v%S<*Y7-u+Zxbys}{o=niTV6|r?m%aiYrjP1IouNDjpWXwz z9r>qmQerRDWUwAVIDz=pvL+%>20#{tQpmFkn@orFHzP@O*R!hL*y);CubCe?wfAuPVS$RHk5Xrd z9fqTX!pV5nq-XkdG>BBU#}z1?4C?&Wtx2RQD69LUmZCE~l644oly;)l;9zUjRyNj_ z);f!u9jNmg-OZ);`pV|Y;$pYeS?w-#yP#PG!K1yjT2qgXV6$iJ6d*vA5YDJQJl+%S z1=O6NtTAc{4^*xh(OzrNy91?-^WFfwFr?0TCkvvN?T!rpIJ*#9Wuk5l{UCo z`5Tl+a;dg{Tb8PRZ+7QqCWZ9G!ak(@2Xug&a~^XQo;fAjg~Anz9RwOzy?0O`gH8ZU z7`@b-E#(+V_pB4bBW5q|Y0?1zr_O3mi&x z4G2RvLb;v=1PgdHO)3;J~qc<(9ffD>)bgI&rX8M8lTF z!SfS!1srMe4xSGzR2zyQUTXG}7@pj2ri& zO&BXZV*=V>&yDfw2Tsh}DZ%j;3^1Vi*d~Y`K;UF`YCTe??x3cg0y6`-t(tnML6IyP z;dgMjE3)L^{zN9zjyhl*Z$+#^ylINVz7|)TaiA>GUIn`sq}IdwUn ziU}XqGBpq7%+2UEqL7OZN`*Gv&$%l_Nx7M5*Bq5whzde#k)9fUB)V{oup!D1oVK)M zKIei$Np;$^(xc>z7W)HAjuMo~lYhlWHz|FUO44ka2WeHG2Q9pEupr1Vg@>XId2{Md zCk+xcEWs;L`Jxw{joX z7uQaHA=|mWyp;RKEP<2lFeBRFw8+ 
zl1i*88KxzjmTSv!eDdjL2(oX?b>4iT`)^K8x%)E~N0ZgtN7$b#HKZ#lONf67wjXy= zs;MZ&fT5%D5JhUo@)l#qD*0Vl#d$+n&%^fgTE!neyi48bd=LvN-514pq4y@K_RvvM zy2UzK%mD{ELg9`9qO}q;YqT6%LP1!+`9!|Z3Ej8#$5n_D#Lke)uYP!2Y>p=&Owgoq zY>k=^o<}3_lZFSLcD^mKT*^^m&eCJ_%c6y^qRyZgO>k7R@G|^_sF~0ku>uH_38Yj6 znyI7hLVbE6^2cv*CePmA{l+`*q>PNy35Ne*c0jcU!AR&oQc;aI*a;w{in8t)>qr!@ z=?5$S)D)C~J2Qj+WDg-V$PwM4W802lt%?1poInQu`O~XKV_zMj_t9K|=tY1;9~0#; z@KBu|Dj59xFg_rV;d@|Yt3HdLGt99#U%o7om`P`efjmW-Q=vfg4l!3m5Myc-khnnf z3dsE)j_(4|3x`61=q(Vv*8Kv}3nwoDqXnY3K=c-fUK7wO5WOmxC~=H40KN4B(ffEC z@hyWnDG6I)Hy#=B-AG!SH zFQ-3tSs;1~MDMo79;R^e0?{icTY=~;5WNb&S0H)|L~nuUMOoki(OV#Tm90}CdJ9A^ z(ew&dTOfK(W@>@x9k+tVf1iG<_!5ZTd6bL;eU8IKxAQ?qrTcGy*?U7oTzxEFQ zygV*{gHh41zV$`vYH_s}sBW+j9}S(-mQ^C3C#5pvUWx2z{A@f&gWpYkkKRvn2+)Tw zO2Pok-e~qCBVUnxF!g=;Z&L2w(#w}q(cHhB`%9FXE%*ObvbpcDcnSaN|9$p578rf? z?#g0!v9qw+sX41%Wk| zMM4JNCt8wQ4gz`7u?2KTb~x1 zfm?+f_yvJ=L13+6xnyk`Re)B{}*SZ`Scf%SMuU;RG) zf&BvRUclW8xO+ihZCD+t5*Gy4Na8C9tW!Gqg1}k|P80;z1>C(Lu>OTk?JeN$1>C)W zyVKuHdj5jI`s-BIsero&T@+*q{e+mBRD)6d*;|FXUoKN%3h`QJVHIazg4-NL;)1|~ zj|JtlBPX9D5aSyr0AOGnZl^wb^Rv(H-@QemPkhlJKDRyzWD8E~j-wnZLUzFd?R_!gy?_Re2E-Sj;)kPDaYHf`x zBC2o~j@>L$2zBQ&bwelo$k^UfqKT4|JEuvu?jfg5I&MR3MM%AXY>bklno?1qc zVD%i9J_MtlJ@Qa+P**P$u04UiC9g#f8J)M|Hm0PHGKK0I0e|V*fHy4E@7o)pXsA@c z#f%G6pM!$QF7DD5&QTLl>LE%wH_oA|NRH`XuQ=rxVJS(P7fi|Yw^0j=uje41S*UAX zNM%v{Xi)P&o~ZgL$F>^(q@O17m%$z(pE^P5N$8(^VQkB9c4 zh$)j3x0>mb8+^2NGg-;`MOaNQ8>5=)6uu$#`LdOm z*(#PORWo zr~%Cqo2odU)YbHkFw`eECiaIeV|&AT6iPW+^h#BCk@;hN&is^#48TRBXoTWNS+T}Z zzjf;tO2*(p<#g*+)k=7_@Vv^I#AG23h#lvCjCwlTF$;tstE#Lft*i?2&23q#Qd`;e z8(oK`mYk@*vaq=}PsGspM=jTm>i4js9eKKCl~p;vH-3yN2KdI0$!xvO6eW(ix860+8jU=TA-!5lXuLc785pN65;0 z-MLIE@9AmA40Xa_h;cy&?y8^4!}>yXVIB)?=-F*2H=ovJ%b_lxb1)ZW(~Gj{McMSC zYa1Xt&kw zJMH~pK#f}GJVP~uanp0#6BY5Sg?z)>j-t_kri&DVPc>;+BB};u-$qV3l<;Bbh~0qP zJ)_AvZm%Cl^Hkt&h|Qpk*Nz)H?btg)8*E7ZzB7QH1uIU|mpDmR0M z%w-?aRCVbixE|mOz@2Uv#W|T%LPx;BMHxao1yh0=ccIOStP#4*=36p$d=JJgd9vw( zDyxnGc3#lFV(H7oQlvQ~gN4C#hv4Pcb{Pv%8$ZHbK!%Q0209FS%H_j&zEF9$RrMh#x?_5?BuO z2LwKriS#mK%Hq4kLNzP6ahmOlJ$2I7%r5bhPN1~2L#G=zMzQN*x(34_jIEv%H|&l* zjGa&$$5>AIENPJb)OdC7Ids&(`d0;kQ9VI?P8b5U#?>N0-FlR&4B9T{M)|O$)99RD z;K~k%!(r`M=@|TI-O1?R^So|oo^O2=J zp^KRpYu}h^Wzve&#*1MVdX_(eH5k(Sl1?TcF5l`#p^K0=E))(Vz z;$dvs|?xJabnx`DE>mH zyVZp6&bcXbtpuA&_Py1F+-4wIT$&;lhS)Nco@zcq$8!!)={0-Y4ehvJ#ru>H*N2Ka zl*A>eWyW~V9c=DM9aUgTiS5EVJ<~e9tUF|!OL{y`p|=l(Q@J%K=i|T`1mO|Y7o57- z=rQ!0uF@C!2%#Th17O+j`a4PwX*+Uiw}ZiePZ4ivIAVo%S$hfWKwM3^1GQ_n@%s_w zU5q0;nzvdACcsrbu#c3x9ET5&%g`ObSEziOcy&I4A#P4;55c|#9Uuze;Zj~vbHvN~ zHtm46LS+JPaynl}vhhS6K|Lu9U*1{&=du%j&f?yCP;sD|a_b@9WLu%_?{nNXFu+eZ zhUviTYh*h6_5mG;Bh0j%rf@d1CJ(l53u;`~#i{C@d@seP3K5uq2o*BQr@W6@1uxKS z*Kb4Nh1$Yvq-pqYagAEt@>11$7{vSv(@p0%OJ{lRz60$gdnk^%Epc^_vvr71un>WS5iZwETmdSTR6TvFAw6ckALiE|Od%%}LQx8H6SF$3x z?4?2J*r9xSChC%O%l3wS+k%f9I)?9r7&8ncy3q@Q4rT|hbfLU^i7M+s#JU*7e&@0f5iN^8`xJ3er7GH z(|*zp!NrgR2q5YpT5VmGYOr>9)kz?GD<7xXomH;<-P!rwS>=wGAC*g8e$2SVcW0mH zAI5p8FY7i*$Lnh=^!+oQg%hq`t;tK%F>Wk!4z@80n5@u^piyZ7P>u!x6|2r1x`+d; zJqiyn1}!H(bR0cbBCD{Rxm>6zbDdfN4HC}33W=;PEWnO&hIoQn>=AF5IF6YDcVPD% zjDAgwBDY<02fY&wWg{A@5!IpJW3w~SE9yKuc&FXM>+$9uVjqx$SQWBH5^eSAk8f8k z1DVg^ylAFuwLP5GTg`u)$l9uiaY)$e09K>hhSkvn)ljec6f5fX1N`EIO*NfZapq-K zq=?iGqTZUYW$cFOioFuEU;KQEz^Z&Nte|z(;yZVV5aCiBvC`{eHsiM zC+K!FTqZmCNbJOJS4#+Ky3$*)HJ}OOJfl5D`AFG!UhsZGJio#jzR!gwCc8c=ofkIA zti=e!YFFnZ7HxdU;PC-=K1$!+hOp{E&Zw_Bu?QNPmUgkKMN>}V!(7GOz!jBMzU$Mh zz!dsDh!6EpJ-~wiv1*>yLB|DH;G;*MK@>`?IvNed>}s~UupVQG+nfD39!6WW+J)7Q zOQ4(!X6k~OdUi|D%#RrPz6G`*8CnY%sDb_%1NPjl|$08p)}KY zts?j@+K6rWj2+`(^=!X4f^3Ey@ey*yj)nk&Q|K@VJ5d$S)mEr7+xZ%Sh14GgzB5O_ 
z^>Y%}exnJgcY>+l5dKk?C-?>kYWG&mU$5p90(_nxJApQ$lo`uITa_83B#xY^@6EE( zHnwJF7OR$ZABGr@*Q8f+Qn|3pI?mAXJ6P1N6HR@m{-8Po)!n)q9Qt%l%AqLE>#1+~ z>{8;CrVbvPTIki8WpwB>ZKgE6sqe_2=0H}$u;Re<$ZVLY@4^qE@`$!V%NEOCX;08w zv9qVX@i(kV7I+#%sLrfZr@j+&7O-rzyit|-g!#uHN)OItDE;yDpBjHS<{wG@6jMJ7 zf0BAqWGeEGl-$?7s(1QQ(`?C*6t-Uq;_;7EXH}}&EYWQ6~l*g(|({FQSYQA4C z%G4mFm!||kEC2yd#OmScg7jk2MSTis4^rPoM`zU`$`<(FT*iZ0&7wKZc+g$mTwC7t z@2IKVvN)+pX%)CTS^r_)q&{V0KojdiG#Oke%G5MrwZgB{Y-Y!@C{xoe%G59}$v+W% zO>w;>fz*=5%<%Kvf@ufdXyDgN9Yh|3-pD!iLMeb$FGYaUIK5$Pt`qp!cr-tSV-wel zSR!7D(69;D0xso#?t}ndvN@+#g^{qPDK%8}sO&5^5r3j8<(k5^4M>^HAW&BDsYN;jkSKujvAhxSW3 z8E8lSj6b{nJ)$Ib6!*ReI0Sc6h{qYX)(Pg=A>OPvVq&{MU}DL_CL#u?9Y8jkf|5&c z6L$E*J*Wm@uV%NR+S1~}S{13x8>=AP0M;#qW>;WP13O2zx$cmd-cGyky9jgR#G!%$ z6xyL;(AEDku$cn@(HCh(1kcy~Q=pclm%z#x(6CM`VF^`)Re-0H=A(;zoyiRV*o;{m zHWN(v0$@#8xnS2vw}CJ})CiX3y%31-_$cimbs6l921DMhP(1RUOji~BIEVb1giAIs zAC^}E*lLMW!ebB1;f#tafEyuGy3z^&U;SXw8GKNbsX2epiZV4tnVO@*guEi{&~D~=Km@3q$bQGK9=iL;7K1$?M*x4TJqEnBQC48+hVAv7R%p-RI8bto zViy2)Boju=8w89M8suR`9*lr@7`y<^RN=PIdCvoVTO)T{LG*N4qNi~_31?w_HLHje zWoo`=WopJX%_1eMNXa@gC9A!@vbnOj*ll%Iy9?d!;>yCp(z3m@TGP40Dht49$JODH z62h5^C{#3PLa5{#Hv~Ytqfv4M*fTt;>P?!|XdV-Z_l7yh3f)MXL8;d&Xa}Y!aoRA? z=cajL!{f6|q8}7$l;e?1C&W zi+zNSv28zpy!{E!G<4TPc2A6qo{w(NO-vg8Dw0?xq+etv1>Kc2e)O9Ig5(-Gc-elj zd(N7YD)KmBMA1A$%1{SLVs*SbFf@@^f(f&Q#mVFzrK|x?SWSOvbdjnFhU~$S;<9UG zzM`-0dAbNPU)6DWvN$rIBgr-H>pTzWFGQUgZUE*2bdSX3L|2P4HR*0F%G7*a6D2R4 z(xL-#yIo(CKO_#29LM0YalLP}no3aKg&c zK$u_)tN6NF#YF-CqHGN!4bewuo_Bx z;G;4KU!rc=vab{Ofx$Xt0PtuT*5K8^Q%p0-^p6>X9I< z#0>oV&?3u<-ZI9gNX_Rzb>n|2FvlbW^*nIXmG3>d`v|r;&W*_t)$FGl(6^0c!_1^+ zGsm3M)U-g6q)avW;Ory`Vn=ZLl8%Wbg8~<|Zqyl8`|-e&5-Vs0{@3V{$ObiW8vK@( zfG>+B8ZSOHu4xvS;{tPBV2+7#E-=SDtP0FA`QJ4GydpqIIo>@RMEnU6ymlu8;5kda z3YaU1l2WM14cedHV7U-%e?M+);h@pSZ35TMfvlBKF)spVeK>;YV4S*yJ%Rxgianx5 z%#2aaTp_Vn<*k@6suFWlnMn6@iULWZxtS=FL}Ih(JUqC7Tc9>NO24FaJ_^infjKTP z#|7s2>u42U0&_fX-GSKl9Qtt1Cps+MzXYoA!kE2_A>v9beH#oLtYa4V^_x%d%VYVu zii4sNbfLk9K0w5o-)3*&=m5&N0nzUv@$Wg@MxRAanC~ghp)kwlxZ}`I=7@p|iwR%L z(PA!vV@Yb66!mrrxOM^8o_@IYt8aZ#x>`8x`H=Ege?aebY0D~gN4_}JA?`|qP)vQx z-({;wc!essJqM+%Sv!iYPxbZE*3|dn53MgsQvZs*`55e7iQu35rvEOiB}89#jLB7c z`En}S{Fm9(3Mnsjhe;6ptN-`e@BSk`dG+qflC#)x7B(y0m361GveH_pY%VV^RyJE3 z>l>@9Yn$r}o#Va##A|_OZa>3oqsvtlkWrLDXkQu)yILIQ)Y9Td3K$z_hEV*>yN!QuMFD;TJgk0*6n# z6F52-Xe-d&p{h8%nd1bd7gc*onrVrMs=K-hOryZz7u??jcnJPZaDWB(H`55M_bu%+ zIJ@>_fy3WwrsR|Y05HXzPoc!*nNOwz3k;47(f@SUuVnx5`=*NB_v3gNZPjXL(ERf$ zzzo|Rdl)<5pcgiXffM(Gj>_ap+>WQz0ANdC*PzTO82EjHQo%*3DWN+Qxx*k5qsKym zSO}sTDM*?oV|ezF6Gk*nZKNh6AH%~x;O7kY0a%9UMrriWaX#m505a*X!17Sw@CzJ% zfy2)!BIA^g1Ep50yBECu&6Snuu39Hgv6%GIE$ZD6<8Q-;PFyfgISG zRQ3d3L!v{4St9UFsr&%_P(MmCfRQl=NEEV@b28XCxBOY&?K8U zJynu}h}A`bC;)SDY!^8E0*7DV@C)v5r)oVxjvdpg3LHN4WSwk?Sdvu6LY82Xqyg7q z=Me5ggnL*o%mG}?X`=`X#4sHsQdx`g6=k$eFzHJobpl2vMFVOC;`bwDIEuWE=B*Yg z(je9`u#b`oT^SS5wpwnCS3huK-cAXQw;<60pfMN@!9LmdBcv>{z0~-1#zLM48Z~HO zOW;jTr>QMS=?*d>PJve6K&AH%685&k-iY~-5#tG`;;rRv!soNpSZ}G&gM0>#3!h~7 zB$eH=p6$;5s)1CBy!@pb#!xZAR&cZ6rzMNyy0;$pw z%|m`&-m$3a`;+zup=$Sd2Dz$`2hXICG-S5z)e z*!k?DaVSLt2RPBndJxQ#Q&KXijg^5@C`>t9_pa?0w>%jOkjih?yFFy6T@Ul`XaZSXwXV z#Vw+jMfB2Ab=+r>GCuOV_Q3UAJH+A2FhCK#{Dwy_&x27Z*$(`cf=4Ln0=Hcj>u~aC zo;6RwoeSWXj*JNMnx!t0+)7dv6r9H2a&V*%7ujxuCna>9_=qPm|&;4o!#@MHlHW3N@QIzT@_ii~$5_gAh-Tg5iP{kbt?dK5MEury>n4`SeO}Qxu`e;7N*)y zh(;hnfnU%72}^iKc>T$ZiTweBZZ%(zpstGJ9x5xVBg_0Typf+WP3nkv`v@qftXSh? 
zZr!?tPz)YaPPblFt%PR_&#R0)2ckRVx!h8Wkp=3cy}z|(rG$Q8Yt+ekEYZqdB_vC!O1r*x5N3?d!w|NR;QvEQaSvue zqOhgm>y1b;>jgcxJy8+QTF5u7?I;=zXu1fNCe@^2iKz0OeZX32e*pC)b_1+PqscjL zuOCNJBkP9P49a-zxEd$x0jUp`pw}kd02f zZUoyOz5v`Q+_I9%0Y|{VjRX)36AG-6p>6FuNA#>K^^;@A_Yf3JPsLU;3~aHo%dFN>;zu}eZS@jucY?OoMYN#;k=k{ph7|RKtB@NP_ z8cX(`Lq`p)e^t@|_&&NzI0S0%%C4eqZbwnjb}=^{waVD(bb%{79FC)t?^k{ad0Jv! zJ;Bh0&aXz;XiS@X?lfO*t9C8-3#o|Yyoo`L-V&=u#H>RGOv?8_f zVwiK2(xV>>x-NSN5?00ItUAr7?=8aPIz(4Q=|YAfO$Vke6Vn+GT6(t5C@PAbbQj; zsw2IG0xLN)9+t)7(afA4X<>{W4&70!iV$!OQj!gnv_|mKGN_ zx|>Uvb%%_DYscLodi$^&g&v%XU0hQ=I8jG{P&nbN|8rTdKPSiP9;@h00M z)SHrS14qykBG^;xfWAgnuyE)riY7Ttu^$+nLON3mYFyXFsmk3k79`-7Q6T~o5TQcG zgE{YGR;MIxM?FVUyDDe9Gz}jvuCdbO@>11$h|--n*9C8me&T;D4urs1_Esvm1N()B zWniHb+Yj{)PJL8ALZ=+)hR)C^f_%%(o8n-FMmvC($@W1LL5_e?HqlL_gMotKfG>ll z9)uXKB&J4)2EFx}u|s*IOw=XmmcZyO__(2C_zu&CF_5^i7a((a7~+-4HD+%jV7VYz zh`N`#(_^8e&z!t`T;EE4`#9~PZ%QsirQ-Y0cOz@j!jHKE3f7W3?Tr}A*7d=~Du8Ht z5+kXtE5>+Bal~tjAe&cE&!!dZ&MJ32G7QoL1N@kAi|@`p&p(VktS{>}NyqDJEA;&{ zo`n;xUag68lCICjBIjV62_Vo_To3}Api?;-1XQd#bLe6e!AJ`U)M+6_+d*!&%$4A3 zApW>qs3~)u3Q9-jUxh?g7Zza0I72)^Eq3HPaM2@lYd_PeS74%%GXF z)%Kv?=19Fn&w*K4MQS{ZF<2eF9)Eobv48skesMw+XXWd$;>^n|6kc^bn|4_;awQ+ep!XJ1js3L4Z#UaEphaI>2Gn zLlZ_e5v$5lK2r8wNN*U6>2rqfbD?Dum7EtgS++&5z-m{9B^E7s10cN*$TCnxq#O1R z!aQ(sF0`BSmC`))N&s*X1_*ef%$~4R0Pms($Ouw)3CO%MIlz#D(L7w8a* zNlbk^e@{lcnS(k_eWTxP2^kC$pz~W=sVxdl7fpRHexejTsMu5oCz;}DFk$LDQ>)Ib zV97r)`}bsw0du9hO67U#JM5I7PL4WE#f|or@u3|B)~K7t(G%;+*z&N zx$GU6B0ZP$Q@_C8r4yZQWj6a1$MO}6+^ef}@*jR<$|{|F4AMGbIkno+0S*?r4Qa*F z2Mf{H29FTFkAKRnZC60TgE*>jP)o(Jz2IO^2GDl#t<20bhIUxZ@PKbhf z;~%lAp1j$Q>lZ2daYJhEF#>gf?2rzelCr7CB`J3xF$maTurl*F@NV0bP8V`C&23rwQU%n>R-rIz=1Za(K17H z9l-+pZ`j%O^@ro*M2GZ^RBgZWLFbuv7mmh0&;Y^)jl3ro;&dXR?Dzh=S*&vT$Xm=jRt66a%)GvKXEHC zD+44xz7I%=epJf;PMj{)E?=G_yZSj8LJxri1x;7R4PW48gXANJQId*+?oa zi*KHUW~7^xps5`pQG)Ri2}`XqNy54b*F9iUw^Z0fFitH(5wTq8;PQj^~gv$QdB+A{jr- z2U%EHq+G&x_-zwFLd0?#;uDJMNEQj<@rs3-xdGRg?Zrp+#p=Qwu0yk8tl|M4$x_nt ziO4slqVQwLwmYIPmem*i06~}fP5E4YuF^G+tyOsFr{6=b=JPJV2fnNaHK(tAm??QN zBsiiz`{4_x4W2$ZZm@7tC|oLjxoBjyXIdcFHPrGZ{?+f{=G=pKpf6H3^W1XOOK^z> z#3=+J?~n{tRbiIgFVBy^0TGC^N(J87lX852lg2tuFUR*~M@r6U=d0?3GkNBeBY9@# z#&{=)^@k#Yz=A`yeyH#TK`+NuKb-hTx+Mb|vu`+uPg{6!ldTuuSG*pWWzkeqz%Ocvyq7meo&n~QAUOK*q^ zF0rC?^x5bmXY?2U&u!qdF)cx9McyJ(+3X~R z7ErS=uu0X*JUWeyu`PpfnEF2bVFtD+X`OJiDp1GNH@ymcLMtRF1A_dKu>NuuP9SpV zJH~uWeSdx|s2J=Dz^{;cnXWP9Aqx^i@vcsNKUQ_xGWR-RV=DPcr4!v(KoL%fh7RdD z+?x7s{lscc@NYI}kXDprBwU+qU&WrGcus>VTUPT#Ky{Wfo8hFXpPWCLoomi4S2K>; ze8g+&J1~2~{fh}1k8?F=R;Ioq?`0?9^x)RCt1-Wy9ey3sYR;@q{qQejhyMaTw&|C< zmCQac8WwPs&6$}U(w+z@1W5}NfOqOA=z~Oc1QgjeyB>fo6hI*N2C>8%yJ@~Or^*#0 zHk=>7g|f+r2LTE_A$4Io;rf$YTiNK5aZ=5m*3-8EKcCbH z124Zw`o*|=PJ(PO3xE!{mcd>-^7kX)&2NAB(fx;%d6TaPBph`d$X0o|QqSY%`>+fe zhr2$o*sOQ5CG1BJZ*R*o(aVGUnA@Ws3#=^9TVPMz1m_JYdAN)QP6+NIBb#sA(Y_P6 zh?@tIt`L0EwTND9<3hJ>Tjkr^+jELs0CztWh!(iFGB-YO?B2++$`5ZpAiLJ(x;I`F z2yphHm+9Uk{_eaW?G9VdYyj4-A;pu-0I>KlpO+SvmdGo+9oh#jrsWCn!|s8NoVsnl zb7cMNz(0hj+jr-!yN-`j_ZOsAf{ltxe-`w8Jn-XS-g10UJUJvMmeFE!L*g{vopeWr!5)yDjn9z*VTxJ|C_kTc z7m%|SW^rGUq*}V)2ov+IJACVjw;<2WWg#89iDK&$C${xlIYM&ZlT2%+jb_|deJ3`xA zVLog zgZ>Y5ELFI*g#S`bOv#*%+(v^uXef6y`N~r;I z`g`Zz{+oY5R6uFVDj7?Nb=DZ>EM-jfue3$)qvG3GfR$}ZgQaNg7bVd|uV~EvgyVIZ z23&gi@~dc8HhTpt_pk8Z-)9%!KfHRky|}!zxwh7+I4douva;TBDx0f|OOJLuq>2Zn;?f*mnAN&8LH^E=`|2c8D_5Zs6 zSD9VZ@2B5hs(B<0=af`W-UW z$Pcr(6a!8H;~ru3P&$HvB}6emd51cH2upM#g3Hh5S^1_B%3M>w;IHrWGkn`$sT=Nw zb$*oct8jE>!QaY7`)c#GqoY^U;L`a~*(++xzN zDCA4coYX{(L~dSypz((F7*vdgKC$#LDF7-AjS9;~f<*xUFr3WiZXtXClE$cS4;>IK z-mvZn31j*g{ky?f2#ORXFnOQn_+{LyXd5F^Jeu2uSnxv6$lRZ~U4cB4t}yh^P>gAW 
zej{sM>h^0?=-WWpyv~do1kQCMc)X-btUsfHwjZMV>#j0r!Chm%I=aPaLmXH2_Ce zWL3q(P{z<1ur^@ipwJtz87R<+BouH4fIU*+D|xshpV-Wjp3XHb6YFD(B!4B_VAY*O ze);I6)bbKAOtaZEwT>rbCnUV9r)=ll*&J)s?yT&mM*#nrp$-9KS-m*lYL;`#ppRAB zN}6e4wivb{F`>fKnjqaX2;W^Fos*eNaK3Vxo|7|Lrf@c=y`Loz+ulejv2mP%cXD1> zvYR#fmJ$m{DO8a|P=G>=mt{ijMka7ng?cvoNVyNpp^ojtvWh$S(Zx?otMAQQs|)yR zojB*TRy;?OQ4}WBMq4=yJn)?YXB*<_MrO=D@~~NKELU1AM?goR zv}Cdr6z(W?QIHe2p}LQ`(-1uuCiOMpXnmb?C9}?_lZ*5^n2rxO@|3;OMpovYrTb>~ISTXIep&H^a@$kK$(;FD+SJ<2(2J|88gUu_MDFKK zy}Yeh`fLDs$>9a4 z+;hsa+j>*|LTQ2bO{;O$m9BvwvyCsLcyf7`jnmCO(jVz|Uu_#v9#c=UZe1^QJ7p;W zAY&Z$gTwONRkra23UO_9ZF6yBZL`u{Usg(hHwumpx4`D9VDx26nqnz(=H$12>{rxRBWJHuBfS7K)0X{GeOOS;3%*_f{8eirj1{sJ9b4azL^3U)cDC<+g?Pr zF5*4_6+#tYI==xYvs;cwCLD$Id2vN;{-KQMk!8PNn1F~Au)o5S%!XZ2ld4wBiBW#r zI&e`U+x0*o1ghXiVB!H~8=zJQ77>tTuvAyn$_*l>utGhQGe%KrBr$BTLvg^8hO;ufFy|{e zKh<3F{I;$N8NW{A{s?$J5LhW1!C9-B32>+QIs@MeB`kkMLt8iK05gysx7!8hGcb2* zNEKdiU>_wYMMa`03|?2%=o`9imXJ@-A|OvzZj>{$hhI)&#FqwE|9*MOy!{|ABmY<| z_|cvf1%TppG+UP8CpagAkRPD8fy$oL24lOvDevLxSI&F5w7KLs8Dc=L11Or2Ata>R zXWrT(6nU%pZ{uWb@$<+5@nRe2JwZeKt!Y5n0mMF)l^}q!;sSbokN9W*XxDFpJ~0?* zB(%QGI>g1^p?w6jwgzAu(bjB6IAz*SvpH`yo4me3Aa=yxsSx4&A(W6Ngt>Zy0E*F+ zUB885cVxxGsgyQb8gkLzfX5FF>vW`+J7n7l$xAG9cDx!2;Dy$xw&lKy;v*0Js9K-m z*kn)8WY0sM1kOr0&zmJ=K7>wh7?)IU;FAOJH`w*j2p*(nNJJ6n<7A+&5C=U6x1(4F z595n=Onf&M+&v?&v+H|7un+rZAGqE?(BsuARsm>;TEMICZ*5ryO0H-UyC}(%8Bsll zIQvX0A5-7Q<g~rmB?$$4KO^hLwebJ)9oT`(Aky+FeI!+OAO{ReuX=~l=$MG=Q zs@3`^Km*0A3v&lKP&McTQO%4_8v`}5BIeu*ju2rUThmL5LC~D5<;Z7uZ$Niua z-7t4mvoE^UY@k&zkst#pXH+lmDw%J!J+zwEv&x77iL8su`hl;k=K;-a>a-CuW1Q{? znKbwU%L@Fe)X>3U2YV73T3j>H2>c!3H`TIENLQI;L1`(Ip{%zpil;3t`k7jz7V!w| zv!sM1&8&K!zK}$Dk~4C@o0z65(Ff8U=akBR2ba?TCSwc-0PWX9N=GY#ScBkKGZU*} zkZ6FBRMlPo2|Nh!Cpo*bNdM`=kyL^D#kPx7T2^@{$==fAgW>Xc?8$gQ*2la7L$m9T z)L4!+3LoO>!H_+)J)9yG`$+&%jJg3n1c2Ov1OXekaX6s!$P#0Cx4O`Xs?U+tA zAwo#P4r*-4uhIiUQ>x>1Z7?odo9Hp(H0V8STha@ld+j!o z;$_MM9*J&fHA9CB1+)S(u~E9>FG_4%LI{Jc9p^n+SzlaT%BVfqkPQzMyU;Fthstw9 zd_Zy>*3bB}>)+#i?_}nCNv+Wb!Q<^+|BPGf6a=z0dLxkjt7QR-Ve9}dKf8qjKUR~E zNxN-_FWiG_5cX$y5lF5RM>eKDItzcL>H} zJ`frU5}F`Wdwuym9QoWWyS_LSf> z39e3%RwG`!0lKo33ZT0l@pgy=F}=w(vZr)$TzDP4wk;t-iP`m zITdJfqJ*V^h9(YIjE{^wnkVy-EC|Q19=iLGYEU-i2;gtE$C$H5S#_dedp!rW@8)nE zsFqOdcH9U?cf{GmGoeB1iIy6LFI)@^zRDb}li%||-`2?8h8M{Y+&(h0g_A2HLuR!M zT`m|711?aEdn(#Q@ezF_vlQmAdbsO&3{e8L<#<4zw~>$#mw0{zNhzBfI^n2=osFr$ z3WQWau^e`jQoj$UhM({bq{~e;t_Q+=o{m_3Pq|!m&NfB}`qh92sJdiC256WV z$j$AJJ$Ra)$3u8DN|&Mt9VbIvT3uYu`d^xqCnE--)ar)uA0665nNF@9D$kQOFWeMW zhfX)Ha_Aq(vTiEJY_o||z0rYwQ)dzaoDh25HhNwF^(%y#N~O?kiNYgGdgs%-pX#gb zs_#(q72X1@_Kf7xS5SRQ9@ULHLwOE9y*HxUNt zL@%f{UR2XG9Iz|6CtE^cT+>`wdrt%>IwGr5>Gg>qHZzko4~=#Ofzol&(HKLjPjHAh z*=Snd;ePAD9|9LyM{7p(6irfnuxh#2#NihB#>A0*LrW}lJm-K4n~1Gpz}$b@^Qx>N zce&cVhw9uD1rKgFv0{?WWcD`p6M77sr{;_PzFF2g;L0~3Y$!a z^fx0(bl3S1wUv#vrM1rDW(Vs0Mt5_my}q)!vbfl7 zbym9z-R|Pb!UB?TmR4)((GhI+jGcn?TqT4vYLC%HMSD>d4~<$vIm|U9+6zopQCg&n zseK9~PkAOe%UbX1Bi4_3Ql2%Q7pEvXn7vvI2^H{=h03DNu`U&k6 zOF}h}>XjTO&@HTw9(@*BWjUBhM2EKh{PFfD|9^X5*W^Zap0~2BiZ!tnVjWVIv_Vpfir{oiuE9HHr{ONW2YfX%Q@db^}dn@S3*?)1lDK#oY#9)_8+PBsbTt zU6-k9wy5IDEer}?R~8Z8n;k+b5PQyJ9mX#l5)(q<48;oSARec05@2%SsKS6TO3l&I zz(~GkA#RiZLqq5F`-n=)EN&D&;@W zXKWB()ilcVQjHNvgzKpx>fOWH&X=!Lbe?IyHd0U%#yHc}d~R0E)}YVg2`CzJy3xVF zh!Aa3M^-EcKMAp&wFl@Xnc3Q!e0MO>B8vCOK2sbpJ9+JI(0nP=6nisIA6OLZIIe_hOMheBK=grl$As44sWLfuem=&Q zDNR1c{Qp7F<+<7tEb)^^9QMM0)@EZ(dK1qYR>`(Y2 zaQEOX)AG<@Zo!`rgIr=zn%WFM=c*JVC?k*eNjE4%?#*4vTy|ugCIz>-DDMOvl(cne9)-F2RPD^$8StOs9BRuM{>h z-PUvuGdrEx*R~M&I#_k$G{g1~Td_S(F 
z+)~kFf~lkF6h-dG@+a1h)&1Lf#brm?jKh0xuUG7(`kYkX*Gj4n)bY=sKCgZ?>n`MTy*eEg^7A?*-k2Vs!(2~4C@G=ogkwCAyPSHu$PEC}6tM30UK zRT*T-cEf=g-N=`0X|*`S?X)PIL);RZPDmHxPgD3| z3B)s&$kzjNaqAGbL@L|b zqbZnlxd`PlVCorQ=`7OKA#MeVY+v*W#;=s!N zS@drI$|XR3_F)<$L{eh}!?)i^`jZ zvYzwg@23#SsvLL~@?H{)hW)2P7O~2~f?BkV;vTRnp^^stZHY`-wka<}Lun#we`=lU6ZB#88Ec+^y!z;pjoZx8L`WqR?xcE-rQc@-`Q%dZu^~UtF67Q?Nxts zcV~06+uHHl*C2NJ?9ZmV{5hpBpM6WZ`T4US8ipl%MwH(6`PWhb@t=_fEwHt>+=p-)UlH#sqQ$DgJ38GT8^PB$`$49~aU$zE zl81F2C-Q;kIFYB4(B_|Se=gdnZ#Yim0@HrJwzV$RaUwfTWXFk2nkC1H>^PAT-gKPE zlteg(=GJi{J5J<+l!q|^&zUUZOqeu5i7Pu#63ACTC+#?qRfNlNA{QB$Eg|qMQoKY9 z>^PByGQ6ZFAc;1|iJaAfnw@G!W5<@c{2nObR|*p;4$_o6}_@h zO(s$asY43fNjdlQ&1G-*p;;G)L3&30xO}58@9{V&%X)5B$gQuk;rOV5>PA`-LT*=8lTC<^RoIQU zxZ+)(goBQ4n|-_=ooSD#LN&ih%_fyQ^y>LttI(a*`y5l>A?W&(0W|E4Af6Nh<{>dY zR&#X_OGc?6-UL_$77P|z#Wq0_?%@PFlp?B%5f?e*g7i&LVl}LI8!va}T6^43X9}|ekvIvjirFAaNigDo$ z%cKE%FO3G_{mN5tgd7gKM1+U+ZOkj#n1=*o+ zXsD`{X5~VVRaCNNx@hlWX6SkXG!d$es`n;Rje|2hes*EzLT^{RyD?>c`BtiABI8HN z$MTTL9)K@Rk_mLTi`yE9{pjcj@+i1bHD7u)oeB5XAJjOK7%b1bjTIOE9EvdSSvB=4 zFMD29HQrVZJQe?{hEby%_~|4LlKS@g{_a((s83&Ng?>`Mi5VTp-EkWgLwoJpsJ`55 z-=?q(U;ADVRBr^sRv@CU32yiCD20gk-BDtM6F$g3ez&r#jGHyC1Y4jYqVoP0cfi1K`~B!uDK4yyzic@!m_4$&jv#=!@r;G&~ViQSAGu zDTfgbC3x{0AhByLxgaFRtSHQ15uZVw*G>r0_!N;24sSKj1DPbT0HGNbT=a3*?6Z_ty;4Hx_tCzvP z<8#n&*`4hxs8TmNm{Id~N}0#ZQ=~g2orTT}$3X&dNbO^28{0U-Y3S=ijNb`D(-76T zh(lj>ZqAP7;PUr%i)FLmec+J^7rvVw5wL?4!yO;5Of`NE0v1=^+mDOPxT$l7|Vv| zd0$Mo>1idOw^LuOZmh%KM3!v&H{M#QdsYr=o86~BW*-*j)V$b^X{Z^D-D}&MyX*V= zo9nYJ-QL*W-df+zA6;x4TD5!8<7W0xTXBkMo$d7#*#f!t3!9e5D1IkM9c%sE*n9aR zx0JV$wf<$pAv@TTZ;aYl-ci~JXUo0K@^R?vuM7G*Ge=NhI}@z`g(L@lNg?ZpFmYg- zCTeE+UQ7`j>~woLg6@qFCCsq{<~y>L^!-x?5GTm~$Z3jTbK8$Rlryzp#&tuSs)O0* za{JU2foHM4WmUs|8 z3Shkye@feZ=s$t--Eb72VWPK?s=^kCpni}4kDSb-AQ)RikWV>0lmII%+99k=J{y*Z z-$sr|jZM5}91Q$4Jmte+sgGiGSC&(YCsYncYx2H>RCpollD(D?oqLFJ!^Vh$7=4D0 z#1DI;Q3s=gM}{!v@HR=?7x}_*-U`Z$5 z#_vRf!Bf1j@d(O!nv-P><)18HJz3Upy!@#lb@{Up7C%{jP`(-8u)eja7KrpyeRrF& zf9A7r!qsbQVw~iC+}Pk4ya$I1gB6DfEGkWlc-3S$qG8op!R?X+PFj45K4=B$;~+3& z#opoe$LT^#ne)^e(IMgZYf5BieI0&GFvcBpvM2nxap2{>=nJCV?5Ju;qiEK54jH*=$I8UM>1>m)1O?gc~(JzG=wEL<)A%) zd3(_OZ`swF*g3G*YeRUAVH;jY4^%_F+NXH$>mTADLEKbB3p36>%uB*yIPmG0Wj%M* z;~IkNsJ^Amcp@VOqjKP(VbYbb(I;d4<@xP4whE?LX+)-2G&S!g)-V<%b7(Y0;5?%j z`08r3@E*l%GcA6M2L`92*)!x(J1zSP%?cJIJu*8C;!!oy30@vMQA5{09fnRY>UJAB zm&q|MiJ#c*njJ!#u8snndRW3kU;HWNaoO8}JTiFV561(N zY|GxJHX0^v`k2Y%Q>^@?hD>+r^{~%G`fT{0NE+Ifb}_5PQqJsn58H>{)O7H0gj_Y23f?pXSm0-OKfzYm58*f^ld)vso2@Rq#}wObCW>}xv*RUD z&W=pkktr{ObRT?@cj>O*fcX$fDzhpsXM=RS^<{>Bq8{)!CH`jalS0TUhh#?PTto6- ztPud(m^;S7I`E_31bZ{~5mHm)Sj7f}pwQz{+(~MW4M_O8kl{GJX zBl{GiHw$X!!78E-u;?(%M2<n%`pZPWZ;X3B&}@gULruTpaudPI>Wo^&-IJN2J+ke5Se8Osx7F+ zf8rHF3AH!fh`)X*hoj%mMO(htU+JHxK{Xx?;Fu!H79#Ey zrSKuKg^TRLTUbz>dodl82a>b#_qUeVmhep168o)<*8WNKz7FMqXRSBsE(8+DtbWxN z6%?YYa~ap03|l$@h$cWaICY%>goe0ZPy@APK`#Ryaa)bdfoBMr;qmN(t!OHIr&?J{ zhvT)|h^Vj15xKgxx_5n*fsA<4PS+3&pQ|lU5jY351rpq}jeJ_eOy&%&UwCdkYk^4` zvsJ1nP_H~h)}>Oo&ig>bbb#WG3jbakp2@vy(_c(b3zgO@lALV1fuk6hfnpkAyPV5Z$X?*P$|i% zng1&e9y`ec$Rq6|H}pz_WG8?bnu{7_zLwOA4c2PHC2r^y4St zX>Anu*8Enowz;vsTSJlH-p*080ThX%>IiyV4Z1|7AW|R>tfog%%iZorAreS9vS^?< z3YD7Kj2(rF9N3OR#k=i4LOzZd6fQymesxUN34vC|*%XkWF`3nC*UK35L;K~1*hadG z906wniUTJA5eGxx2|#cbA>8N$AT(tqum?fv|2_&p?6un6q9Ni0Aiib-5Yv{XD|T?D z_^uQmrqsx0iKr|Gq^b2(09ET6Pm1Q1I8y|P3TCq^{t?VFW)Oe*@q4$WP6-@Jz|E(? 
z%Kz;8%G`3iF9RMvC$N)SlYCt%ki{RgBAoz)6M%rvMK`?KLXsR)LSfqs$RdCZI~mYB ztOU3r;6?J|&SQiR#a}(fcToy$aMc7g^DYuT5Lo}zYSIN{QGKlGMVt^w@iQcAq@Wex z3j*AM%O3h?S#4|qz9IBVcyyUWO_-Jllpj6vP?M6de8yBDm?~p|zhwcH4)TU^h$jV| zrinUR_q$X%n@6a>)}{EP_3J?YK&LA&nqSNt}PpS-)l8>qr8EQR6(YX zrBIx>hKp{Iagc2_vimff_+_JmkEE`**}Py^Wz(eOraa~f#cvoW{_3hG8rHr-5P!e~_NlH)`uE_#T>-Vt*atmRuWhg)fIQh@f0v#=qW|e^m8@{s`4>^+UNX zf7e(Ob>J<#(x|USY1CJ?BI@C9|LC1}asXuIZ~M!4Car$;PJiW2e)&bX$cqcWw&&t8 zpB*g^X@_*AT)iI*#*$$1qvSD)j**j0v@l=Oo;iGvrw+=Lqc@ErJ*U-&;+d5&gwk$s z37jHn3$`Stb>LHpo2GTs_GNzhrrZ3+kgs=F9P-s6Uq88Zjqy7lEp(dh9Y z==1_`Vfk=K@hMI@!2sVs2Jsk9IsW4o&Q^Qc;wtYUEb`MoS0g}qHFki+n?#Qim`K+@ z_}Q&H3?l@MqA7H% zf)yR|)xho)2OEMI(tkAp69lpj|^cR z$!myXLV07SL&PAYHO3&6f-?+;C$O^+^`xO=FyV}`@hc`7JWaORI0MAPZNG;aL?zz@ zV9J)tF*PUfs4X1)$W>%N&>V#kl)$M%NHbh~l4|nLlW3AiLS(8@xQM%t#(EajypM2G zvgvdJ@aUMbQSUaRG5&z*jX7LE`sErpN%^H(4ppz2-yM&Tl}3~+ZUP(0pt&!V0a?d* zc|o{Wz4!sD|&O%8^!*F=5?i5>7RNFh#= zW4$j+N@K+Xr!{+~@dyt2$|-MaK(joH3>K(`B(Z@6^-KvSQ`T4kmWa|&71Lw|#R2kR zT1~1GT2yc_wkU)pGq6w6Vm<)rI^``A?-4M=sni`x0kQ_=)FBTi!!dttfQW_EITw-^ zsw`%z^kc*bNDogkLz)8)`O2f8#!FtVkvQb*f#;B~4*5Dw z3~H?_Y$%TAJ1U|i&oT8bMb}Bw?`9}*y^Kl|JhNourC(wRTf`T$@GYEox@j4fb&E%$ z!_}0A8N)NHRqKI5bkagS>TnHuAN>AWW|6M72BX&6_TKL1Zf9e^gF?2w?*3-`+V=kT z#zwc*+3Bu#yBpi<>ziBt=FXblI#Fg>+oIa|3}tVyV+$Zl8s0SLm9ROCfeEz*Ii`BY z81hzbgUGquKZTR0ynO6kHLvx=BG)Tx$`K7s&MEOhL;lJ*68KFdU?}`ixS>Vr!Jshw zDCpGUm^$Puv|MD?WkXT}H!vzB>7Idxx&wAfH8cWDE(8HEV2ESLXsKjMu0#{Wh=^HP z8dH>iANBY$iyMWHSf&0Tp9F=E>iBxrlEQW157NHj4hG*6LjzWC26snvoDJgGn;^_J z#ktjpY=q7w_yKC5Lc?g2-`5cQDq0}t@DZZ{;MyqL^tNSfiE;U?#wL zoY~qMyqf_J8rY=@Yr^rj3ZnFY?Bun-LGvNRGLboHgGmU?w1Iz!8)VgP8e&MkEO>XT zzM@!j2@U~lXRFaa1@E!n0t2wVj6=SDUC8$@8K08C$>ig z{3+dP!iH}Igk2Wk9uCG9Zp!mXn-4{0p5WBaI(V@p_yav=H1rAn!vj&JsUEF!aUsbR8B}4(Pdha9hH~_-*GIV>DTdxPDlYRCTA#D{(8v-RuotjZp zKfH&3d@O(0kgIBpy0Bp52vF9-zm|^-hzP1KgQR}{0Wh+kBtcvz7Q_mS3wWaC<3Lhm zRuUc`i?3wA@+4KRdIuVIpkW6ZE`fK?qcnYG=UMi~eRaRJ*IM1%-)!v! z`v^yFuRrhq!?Qn|9_r`)e|-A<*|(&dpFjKI{9e|#eV*ii`oHi0L;oKQ*8VsBf671j zP5;*g2>xZci7#HrZwwj!&&LiKekmi(aUwfpc#2p?>Nt^w#?~Rj9WvZ;B0EmxMDVPT z3k7Ah<3vtqp?4>Oghp}>@{~~l6{Mj_i*PoTH#WD(?Zgv;YGwW~IXb!0qzsH^6rl#l zsFFj5J7l=6!*j@R10j*6t|z(D)o+m7zJNP*si-8H&4S zK`e}I4jJx{;Xqhk2bfR`h`6t0ng9SwVU|F1oV*qY(H6cVMPUvZelSgi=W^g*)f~8? 
zUf4zv^Nkm%Z#Yh5$BAqpsty?*vntvl!wHUl5m?_L!^LY;H^m{t0atgN$izvOxNimh zIFQ=)#TU2GOhWZz1+(0`Ag$*g| zaDa1k$Z*Gr47lJh{G2Y3lsX+|_-)R^BPgY!whv5t&@=};u$GmfIAnMO0BC_+mk`-A ztG9l?qHDKuwem<7F8hx8yzjEDmxzoFkH2skOQ7fY zqZ=O?SiIvzz6>FuLxvli_!~urzZ|IkEyQcR2wnL5D`ao8m(z5fj@JqWzfRHBo>z|7 ziq4?pwQ|`@l<5Om3~mj@K&f ztN-q@mlj54=?pF+*wlVQrnkr;nOsEFBhfUFoaE#k8C8#=`H*Vq>6=TM*#~n+90utb zNz`-hx)LCe93W*qH!I}U*U}+;TRCsa?Jf~ZYF1m-=h8k2-WcPtOQ=*M4ho?DDB@dr z-V124H$AYnE7yejKa{gk^%AIW?Dj%_8Mbj3sX^Wg-XLuA=yZXfpr z0sx+Mh+i3i2xRtz4pgFH9K%q)i%yd};6*@nAS##TjVT-cHj2#ocrf?AxB~vFaKKd1 zFCcf)$j}4(m_R6XB*=yu1#hPEUIS0)h4LqQFZ~ejSDu0+M10Y0Qns#dV_q4*N)Y0+ z>VW0ZLAZacbTdX!6>bx3I!kc7#6)pgIS5LTq z6fxBb2BXJ^Mu)MHea{FS;klfMW35mGJXWkmWNFC0KEDnnfL>{1ZDKt>qkdN zNX6hr)qLsIbSB(ie^A3o8)2|K?>1Ik_;V=2yl2nest#IJCxG!>G{>M5!sM zZ?Es~UL}|0^rcqlC-s|{(Sh9E^QsyReeK(5EcLZ-Q&@(teJ>bujbPXcg!(GM?LHo* z&=|XGxJ&)P2ib?*=sZtW2ltnJgdwJ-dzd+va)6TIT`7;HtOyz7R~^fY(!vZgMyL~y z#^@KkzzzMYO7$(sZG{o&$f1!{98d3Mvq@VHb&~BUg{lHQ$-t(T`VMvCP$y!jI6gv@ z$lw44zS*Hp;-GubRA<5Q5y}qgl(@ayVw}Gh_a+dfNa4PE%F<@!X}-#+lZ())>T2aX z!%3u`O3~?j9K#n9{67ls-~z%o%PB@#ig$_+S%)_m^}_aCL%irAU-8~clF3jCQi!F= zEopcns-oBjjGX=lP){;P;YC_YE}*_dB0QT{#AnbWKO-p}jC}0+{a^??3tpUVdL${E zeXl~xBPh;`l&yF%MSBppLqR9xV;Y^XT{8O%S3&xOcmh7pjJi^l0KK(Ok%5LtHC=CC zNL?)hpxCCiz_DoGwr0KWwHDqaj7#y|knUZ^b2T1be*#lH6_I7{%#>SeI+ z_#E_Gc4zwvib_ET2T$lq|AeTb1>2a#%sdQHS3 zv`G!Pj}0iMo4Zr$R+P}ID^o5hdG3@E?y|QyYRcsL$m;Tdht9kuUzqdAj5-|@#O$Y-TV38oXSWeqY{ttWdkV4w!xzZ$-Yi8-MQbdKvJfotq1$*3J- zY&sguG65{ZH>i(7Ok(!=8kdl}W!}|2bX^>LHUS%O(&)v0$AnS1>|;M`I0I(ouZf>S z7G1Vj;wbdsw{YO)jJ)|^^5*iXX%t=&`%U%!xQ1^B1589l z9>(eiO5LHi))<1Wqi&^A(Z9YZw)&~QgBfE(Ed0gv zoOoKv=k3&2s~hX^H<2Zq{*AX*>YkN@+Gh9Zk7*(n<`ki8*$-vBzz(GiHG{ExZF_Tf zeSd#*eYU0B8~fW^>)ZJqifu!yb}xF|%>L=v@=Xz|Y(BDm6{@QYKYdXc>7pRKNV;=N zx%n2d<-cq=WSVw)8ZhB_xgpnX&$ZIAMc#IMu-yLY*uj}O0s{S+VEr#7Iq*vg06v6? z1Jg9o1JL&f5#>&|ht9Y+LX?npN*s#apC6J&V_`I*f&xu01e*;_e+y<@H^iw*Es3cl z0k=<05!i$X4Kms$&oI$j%oX9QBbbNyf0#gzf?#Y7K|ba1Py(#5Xos*e**;hzK4R_0 z*hKs+4j`Erp7LR^)JL(*9TS;{vYT3kdEY@QypVOtUQ39GJ;bzx!a6Bpu@&W_=SqO`tEL*&? 
zxf$QE4z6mkK$N+9cbl<)=Ck0<)N85;lJ{|AgX{D?Se+QGI0QwlO*&PR;fRJ+X9c%Q zpeQkkPtgahAblJF_sYzQ^bTz^P8V9roTuK14hhF!QzAR->+oZOai&RudIxG#Adh2* zARPL=;NZYp6Q?L_uZ6?jOheg-hBl%$j(Ti%A$vt{=oruRTX;O4+?&AEc#N5+zqT;=Qkbh<`uBt?#2S|TRIX>C z&sIWr-lKS0)8fZ?U~r6P2J)z#mVJdxE5iUfhE6c*c9rcy425G{5HTY({rde}bnriL+pI6|&k zScf3=eddW?hbAzS2@2Gag>SaH@E%iax0$TZrOl3)Ksh@SXGh}fNSxm=iSzO3%V6LA ziz3zTH~W=-<=GFWX^52|S7LVyYSNmRs;c$HEg2`L9O?&4OB*%MyJb$XV@n3Kpk^-1 zK@=kW&6d z8$_ zj!w+E5xi^-X*_~67alFb?wtko=`Wb$6F*JpHfu{ue^r#Y$~{zVa1P3qc~kl=GaHkI zZ3~BthX}5dE|IhjTf@AljZFJ;&}KoM4}BREPGOrl85lVLN-&n72``=#Ho-VuKU+|% zcaUNkz>_m=wkC+CG#N>=k~gJ8V?m95mk}uIRU$57PW}b8=E&wS>5ykhsnwv%s3YJB z(o%JQ)s>gHpO^Tj=+&L%R!v51R z;?E&1+zA>u`Gc4sh#-dS)bu4*7|WBRw0{2>&wroIh|*hWFsxEsroPNp@fmE~@)c30 zq8V}u#CiuW#+PFB3fPAba&BA)d^-f3D~HD)+`ikmdF#Uuj*y+S8$OrqbTjxXG1eJg zw5aVctt}&s`nJ|7ze-p879UjqLWTlP{j$m6kMZg}S`u&EIf*)5WSqwx^)a{Kie@ zSU!m35hnPZ5w||v!Q*o1c+2^(m(j2_)3OTYRpWkj88`%4VMLgqb_LV?4zz(W<*LiT zf1~y!>Y2nnKDuYF+(m$Pm{bw@N1V2IRzDt%@7mAqnTvMqKniXwi)V;HsX+7z;9g(f z;G)H6_}4xjhvbVYDJONHyJI-Bsgz7WOc*lc=~;cFw!WgxN--sdxaDy?(T2+6$1uRU z;$nC``F#$8=dEC02fWN780SLHVr zwRQfhnXU7U=l;(*VBQ6$R2DzheS##p*#Ql=$08;P$d%x67w=RE~(&GiS z?m#ldO`I#O7_3%cFhp84MZw6@lEoXaeaM*dI+CDF#^`Rfpcel}Eo@)9O!#y`P5fcC zQqGYS!?p`*(|3#65eXMBs3Ct+%8M+h5&x3;7!w{}P!qN', 'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}), ('tool_name', 'code_interpreter')]": { + "[[], {\"kwargs\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 
'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\n# Load data\\ndf = pd.read_csv(\"\")\\n# Rows\\nprint(\"Number of rows and columns in the data:\", df.shape)\\n# Columns\\nprint(\"Columns of the data are:\", len(df.columns))\\n# Column names\\nprint(\"Columns of the data are:\", df.columns)\\n# Column dtypes\\nprint(\"Datatype of the columns are:\", df.dtypes)\\n# Sample of data\\nprint(\"Data sample from file:\")\\nprint(df.head())'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "error\n[stdout]\n[Errno 2] No 
such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\ndf = pd.read_csv(\"\")\\nprint(df.head())\\nprint(df.info())\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'code': 'import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf 
= pd.read_csv(\"\")\\n\\n# Convert the \\'Year\\' column to datetime\\ndf[\\'Year\\'] = pd.to_datetime(df[\\'Year\\'], format=\\'%Y\\')\\n\\n# Group by \\'Year\\' and calculate the average inflation\\ndf_avg_inflation = df.groupby(\\'Year\\')[\\'Inflation\\'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation[\\'Year\\'], df_avg_inflation[\\'Inflation\\'], marker=\\'o\\')\\nplt.title(\\'Average Yearly Inflation\\')\\nplt.xlabel(\\'Year\\')\\nplt.ylabel(\\'Inflation\\')\\nplt.grid(True)\\nplt.show()'}), ('tool_name', 'code_interpreter')]": { - "type": "value", - "value": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'query': 'How to use LoRA in Torchtune', 'vector_db_ids': ['vector_db_']}), ('tool_name', 'knowledge_search')]": { - "type": "value", - "value": { - "content": [ - { - "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:cc646\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. 
code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:cc646\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", - "type": "text" - }, - { - "text": "Result 4:\nDocument_id:cc646\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. 
code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null } } }, - "()_[('kwargs', {'session_id': '', 'query': 'How to use LoRA', 'vector_db_ids': ['vector_db_']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:cbc88\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. 
LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:8892b\nContent: with training with LoRA quickly,\njust specify any config with ``_lora`` in its name, e.g:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n\nThere are two sets of parameters to customize LoRA to suit your needs. Firstly, the parameters which control\nwhich linear layers LoRA should be applied to in the model:\n\n* ``lora_attn_modules: List[str]`` accepts a list of strings specifying which layers of the model to apply\n LoRA to:\n\n * ``q_proj`` applies LoRA to the query projection layer.\n * ``k_proj`` applies LoRA to the key projection layer.\n * ``v_proj`` applies LoRA to the value projection layer.\n * ``output_proj`` applies LoRA to the attention output projection layer.\n\n Whilst adding more layers to be fine-tuned may improve model accuracy,\n this will come at the cost of increased memory usage and reduced training speed.\n\n* ``apply_lora_to_mlp: Bool`` applies LoRA to the MLP in each transformer layer.\n* ``apply_lora_to_output: Bool`` applies LoRA to the model's final output projection.\n This is usually a projection to vocabulary space (e.g. 
in language models), but\n other modelling tasks may have different projections - classifier models will project\n to the number of classes, for example\n\n.. note::\n\n Models which use tied embeddings (such as Gemma and Qwen2 1.5B and 0.5B) for the\n final output projection do not support ``apply_lora_to_output``.\n\nThese are all specified under the ``model`` flag or config entry, i.e:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\",\"output_proj\"]\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.llama3.lora_llama3_8b\n apply_lora_to_mlp: True\n model.lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\",\"output_proj\"]\n\nSecondly, parameters which control the scale of the impact of LoRA on the model:\n\n* ``lora_rank: int`` affects the scale of\n", - "type": "text" - }, - { - "text": "Result 4:\nDocument_id:cbc88\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", - "type": "text" - }, - { - "text": "Result 5:\nDocument_id:9dcb7\nContent: ora_finetune_label>`.\nFor more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.\n\nLet's take a look at how we can fine-tune Llama3-8B-Instruct with LoRA on a single device using torchtune. In this example, we will fine-tune\nfor one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device\n\n.. note::\n To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.\n\nWe can also add :ref:`command-line overrides ` as needed, e.g.\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n checkpointer.checkpoint_dir= \\\n tokenizer.path=/tokenizer.model \\\n checkpointer.output_dir=\n\nThis will load the Llama3-8B-Instruct checkpoint and tokenizer from ```` used in the :ref:`tune download ` command above,\nthen save a final checkpoint in the same directory following the original format. For more details on the\ncheckpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.\n\n.. note::\n To see the full set of configurable parameters for this (and other) configs we can use :ref:`tune cp ` to copy (and modify)\n the default config. :ref:`tune cp ` can be used with recipe scripts too, in case you want to make more custom changes\n that cannot be achieved by directly modifying existing configurable parameters. For more on :ref:`tune cp ` see the section on\n :ref:`modifying configs ` in our \":ref:`finetune_llama_label`\" tutorial.\n\nOnce training is complete, the model checkpoints will be saved and their locations will be logged. For\nLoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights\nwill\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" - } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c", - "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c", - "8892b092-6394-471e-b143-a23c6cc374f8", - "cbc884b1-9d88-4d5c-aff4-7a4b3a56618c", - "9dcb747d-0627-40cc-a23c-0bee2b6b05af" - ] + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null } } }, - "()_[('kwargs', {'session_id': '', 'query': 'Llama3-8B attention type', 'vector_db_ids': ['test-vector-db-']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:num-1\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\nof models across a `range of different benchmarks `_.\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\nThere are a few main changes between Llama2-7B and Llama3-8B models:\n\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\n- Llama3-\n", - "type": "text" - }, - { - "text": "Result 2:\nDocument_id:num-1\nContent: instead of 32,000 from Llama2 models)\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\n\n|\n\nGetting access to Llama3-8B-Instruct\n------------------------------------\n\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\non the `official Meta page `_ to gain access to the model.\nNext, make sure you grab your Hugging Face token from `here `_.\n\n\n.. code-block:: bash\n\n tune download meta-llama/Meta-Llama-3\n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:num-0\nContent: :`download Llama3 Instruct weights `\n\n\nTemplate changes from Llama2 to Llama3\n--------------------------------------\n\nThe Llama2 chat model requires a specific template when prompting the pre-trained\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\ninference on the model, you'll need to use the same template for optimal performance\non chat data. Otherwise, the model will just perform standard text completion, which\nmay or may not align with your intended use case.\n\nFrom the `official Llama2 prompt\ntemplate guide `_\nfor the Llama2 chat model, we can see that special tags are added:\n\n.. code-block:: text\n\n [INST] <>\n You are a helpful, respectful, and honest assistant.\n <>\n\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \n\nLlama3 Instruct `overhauled `\n", - "type": "text" - }, - { - "text": "Result 4:\nDocument_id:num-0\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\n\nThe tags are entirely different, and they are actually encoded differently than in\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\nLlama3 template to understand how.\n\n.. note::\n The Llama3 Base model uses a `different prompt template\n `_ than Llama3 Instruct\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\n template for optimal performance. Generally, for instruct and chat data, we recommend using\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\n Llama3 Instruct.\n\n.. _prompt_template_vs_special_tokens:\n\nTokenizing prompt templates & special tokens\n--------------------------------------------\n\nLet's say I have a sample of a single user-assistant turn accompanied with a system\nprompt:\n\n.. 
code-block:: python\n\n sample = [\n {\n \"role\": \"system\",\n \"\n", - "type": "text" - }, - { - "text": "Result 5:\nDocument_id:num-3\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" - } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "num-1", - "num-1", - "num-0", - "num-0", - "num-3" - ] + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null } } }, - "()_[('kwargs', {'session_id': '', 'query': 'NBA creation date', 'vector_db_ids': ['test-vector-db-']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", - "type": "text" - }, - { - "text": "Result 2:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at 
Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" - } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "nba_wiki", - "perplexity_wiki", - "perplexity_wiki" - ] + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null } } }, - "()_[('kwargs', {'session_id': '', 'query': 'Perplexity company founding date', 'vector_db_ids': ['test-vector-db-']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"query\": \"How to use LoRA in Torchtune\", \"session_id\": \"\", \"vector_db_ids\": [\"vector_db_\"]}, \"tool_name\": \"knowledge_search\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "Result 2:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:1b69d\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:1b69d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. 
note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:1b69d\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe ', 'query': 'Perplexity the company founding date', 'vector_db_ids': ['test-vector-db-']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"query\": \"Llama3-8B attention type\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "Result 2:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:num-1\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\nof models across a `range of different benchmarks `_.\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\nThere are a few main changes between Llama2-7B and Llama3-8B models:\n\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\n- Llama3-\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:num-1\nContent: instead of 32,000 from Llama2 models)\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\n\n|\n\nGetting access to Llama3-8B-Instruct\n------------------------------------\n\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\non the `official Meta page `_ to gain access to the model.\nNext, make sure you grab your Hugging Face token from `here `_.\n\n\n.. 
code-block:: bash\n\n tune download meta-llama/Meta-Llama-3\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:num-0\nContent: :`download Llama3 Instruct weights `\n\n\nTemplate changes from Llama2 to Llama3\n--------------------------------------\n\nThe Llama2 chat model requires a specific template when prompting the pre-trained\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\ninference on the model, you'll need to use the same template for optimal performance\non chat data. Otherwise, the model will just perform standard text completion, which\nmay or may not align with your intended use case.\n\nFrom the `official Llama2 prompt\ntemplate guide `_\nfor the Llama2 chat model, we can see that special tags are added:\n\n.. code-block:: text\n\n [INST] <>\n You are a helpful, respectful, and honest assistant.\n <>\n\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \n\nLlama3 Instruct `overhauled `\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:num-0\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\n\nThe tags are entirely different, and they are actually encoded differently than in\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\nLlama3 template to understand how.\n\n.. note::\n The Llama3 Base model uses a `different prompt template\n `_ than Llama3 Instruct\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\n template for optimal performance. Generally, for instruct and chat data, we recommend using\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\n Llama3 Instruct.\n\n.. _prompt_template_vs_special_tokens:\n\nTokenizing prompt templates & special tokens\n--------------------------------------------\n\nLet's say I have a sample of a single user-assistant turn accompanied with a system\nprompt:\n\n.. code-block:: python\n\n sample = [\n {\n \"role\": \"system\",\n \"\n", + "type": "text" + }, + { + "text": "Result 5:\nDocument_id:num-3\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. 
note::\n\n Calling :func:`lora_llama_2\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "num-1", + "num-1", + "num-0", + "num-0", + "num-3" + ] } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "perplexity_wiki", - "perplexity_wiki", - "nba_wiki" - ] } } }, - "()_[('kwargs', {'session_id': '', 'query': 'Torchtune documentation', 'vector_db_ids': ['vector_db_']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"query\": \"Perplexity company founding date\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:ab1b9\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", - "type": "text" - }, - { - "text": "Result 4:\nDocument_id:cc646\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", - "type": "text" - }, - { - "text": "Result 5:\nDocument_id:8bcf6\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "perplexity_wiki", + "perplexity_wiki", + "nba_wiki" + ] } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "ab1b9c78-180f-48cb-bbef-c70a4a59e42d", - "cc6460bf-74ab-4d11-8d32-bc02144a4e79", - "8bcf61e4-98c4-41a7-87f9-833c1a4d2b28", - "cc6460bf-74ab-4d11-8d32-bc02144a4e79", - "8bcf61e4-98c4-41a7-87f9-833c1a4d2b28" - ] } } }, - "()_[('kwargs', {'session_id': '', 'query': 'current CEO of Meta'}), ('tool_name', 'web_search')]": { + "[[], {\"kwargs\": {\"query\": \"Torchtune documentation\", \"session_id\": \"\", \"vector_db_ids\": [\"vector_db_\"]}, \"tool_name\": \"knowledge_search\"}]": { 
"type": "value", "value": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta to spend up to $65 billion this year to power AI goals, Zuckerberg ...\", \"url\": \"https://www.reuters.com/technology/meta-invest-up-65-bln-capital-expenditure-this-year-2025-01-24/\", \"content\": \"Meta Platforms plans to spend as much as $65 billion this year to expand its AI infrastructure, CEO Mark Zuckerberg said on Friday, aiming to bolster the company's position against rivals OpenAI\", \"score\": 0.73914057, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\u201cloved\\u201d an image on Facebook known as \\\"Challah Horse\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}", - "error_code": null, - "error_message": null, - "metadata": null - } - }, - "()_[('kwargs', {'session_id': '', 'query': 'using LoRA in Torchtune', 'vector_db_ids': ['vector_db_']}), ('tool_name', 'knowledge_search')]": { - "type": "value", - "value": { - "content": [ - { - "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:c4fc3\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. 
LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:c4fc3\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", - "type": "text" - }, - { - "text": "Result 4:\nDocument_id:c4fc3\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. 
code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:1b69d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "type": "text" + }, + { + "text": "Result 5:\nDocument_id:deca9\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. 
You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "b222e2e6-0584-429c-bf93-db53059f56fd", + "1b69d5af-63c0-439b-af6b-db5ec865ec3e", + "deca9bab-a475-4955-8dd9-7235ebd0f2a6", + "1b69d5af-63c0-439b-af6b-db5ec865ec3e", + "deca9bab-a475-4955-8dd9-7235ebd0f2a6" + ] } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "c4fc3cb6-6172-489e-90a7-b39d343e14c0", - "c4fc3cb6-6172-489e-90a7-b39d343e14c0", - "c4fc3cb6-6172-489e-90a7-b39d343e14c0", - "c4fc3cb6-6172-489e-90a7-b39d343e14c0", - "c4fc3cb6-6172-489e-90a7-b39d343e14c0" - ] } } }, - "()_[('kwargs', {'session_id': '', 'query': 'when was the nba created', 'vector_db_ids': ['test-vector-db-']}), ('tool_name', 'knowledge_search')]": { + "[[], {\"kwargs\": {\"query\": \"current CEO of Meta\", \"session_id\": \"\"}, \"tool_name\": \"web_search\"}]": { "type": "value", "value": { - "content": [ - { - "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", - "type": "text" - }, - { - "text": "Result 1:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", - "type": "text" - }, - { - "text": "Result 2:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "Result 3:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", - "type": "text" - }, - { - "text": "END of knowledge_search tool results.\n", - "type": "text" + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": 
\"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) - Investopedia\", \"url\": \"https://www.investopedia.com/terms/m/mark-zuckerberg.asp\", \"content\": \"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg is a self-taught computer programmer and co-founder, chair, and chief executive officer of Meta (META), formerly known as Facebook. 
Mark Zuckerberg is a self-taught computer programmer and the co-founder, chair, and CEO of Meta (formerly Facebook). In April 2018, Zuckerberg testified on Capitol Hill about Facebook's use of users' information, including the sharing of 87 million users' information to Cambridge Analytica. Technically, Mark Zuckerberg makes a salary of $1 a year at Facebook. Booker Join With Facebook Founder and CEO Mark Zuckerberg to Advance a National Model for Improving Public Schools.\\\"\", \"score\": 0.74697095, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\u201cloved\\u201d an image on Facebook known as \\\"Challah Horse\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[[], {\"kwargs\": {\"query\": \"when was the nba created\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "nba_wiki", + "perplexity_wiki", + "perplexity_wiki" + ] + } + } + } + }, + "[]_{\"kwargs\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n 
return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[]_{\"kwargs\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[]_{\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[]_{\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[]_{\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, 
average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[]_{\"kwargs\": {\"query\": \"How to use LoRA in Torchtune\", \"session_id\": \"\", \"vector_db_ids\": [\"vector_db_\"]}, \"tool_name\": \"knowledge_search\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:af027\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. 
code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:af027\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:af027\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. 
code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:num-1\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\nof models across a `range of different benchmarks `_.\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\nThere are a few main changes between Llama2-7B and Llama3-8B models:\n\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\n- Llama3-\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:num-1\nContent: instead of 32,000 from Llama2 models)\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\n\n|\n\nGetting access to Llama3-8B-Instruct\n------------------------------------\n\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\non the `official Meta page `_ to gain access to the model.\nNext, make sure you grab your Hugging Face token from `here `_.\n\n\n.. code-block:: bash\n\n tune download meta-llama/Meta-Llama-3\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:num-0\nContent: :`download Llama3 Instruct weights `\n\n\nTemplate changes from Llama2 to Llama3\n--------------------------------------\n\nThe Llama2 chat model requires a specific template when prompting the pre-trained\nmodel. 
Since the chat model was pretrained with this prompt template, if you want to run\ninference on the model, you'll need to use the same template for optimal performance\non chat data. Otherwise, the model will just perform standard text completion, which\nmay or may not align with your intended use case.\n\nFrom the `official Llama2 prompt\ntemplate guide `_\nfor the Llama2 chat model, we can see that special tags are added:\n\n.. code-block:: text\n\n [INST] <>\n You are a helpful, respectful, and honest assistant.\n <>\n\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \n\nLlama3 Instruct `overhauled `\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:num-0\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\n\nThe tags are entirely different, and they are actually encoded differently than in\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\nLlama3 template to understand how.\n\n.. note::\n The Llama3 Base model uses a `different prompt template\n `_ than Llama3 Instruct\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\n template for optimal performance. Generally, for instruct and chat data, we recommend using\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\n Llama3 Instruct.\n\n.. _prompt_template_vs_special_tokens:\n\nTokenizing prompt templates & special tokens\n--------------------------------------------\n\nLet's say I have a sample of a single user-assistant turn accompanied with a system\nprompt:\n\n.. code-block:: python\n\n sample = [\n {\n \"role\": \"system\",\n \"\n", + "type": "text" + }, + { + "text": "Result 5:\nDocument_id:num-3\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. 
note::\n\n Calling :func:`lora_llama_2\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "num-1", + "num-1", + "num-0", + "num-0", + "num-3" + ] + } + } + } + }, + "[]_{\"kwargs\": {\"query\": \"Perplexity company founding date\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "perplexity_wiki", + "perplexity_wiki", + "nba_wiki" + ] + } + } + } + }, + "[]_{\"kwargs\": {\"query\": \"Torchtune documentation\", \"session_id\": \"\", \"vector_db_ids\": [\"vector_db_\"]}, \"tool_name\": \"knowledge_search\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:61fc5\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. 
code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "type": "text" + }, + { + "text": "Result 4:\nDocument_id:af027\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "type": "text" + }, + { + "text": "Result 5:\nDocument_id:d5787\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. 
_glossary_fsdp2:\n\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "61fc5307-4b19-4b23-ab6b-4abbd9614d2c", + "af027703-518d-44e3-b7ab-ff5feb73b769", + "d57876d1-5073-4954-b100-b192d52d04fe", + "af027703-518d-44e3-b7ab-ff5feb73b769", + "d57876d1-5073-4954-b100-b192d52d04fe" + ] + } + } + } + }, + "[]_{\"kwargs\": {\"query\": \"current CEO of Meta\", \"session_id\": \"\"}, \"tool_name\": \"web_search\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. 
Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) - Investopedia\", \"url\": \"https://www.investopedia.com/terms/m/mark-zuckerberg.asp\", \"content\": \"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg is a self-taught computer programmer and co-founder, chair, and chief executive officer of Meta (META), formerly known as Facebook. Mark Zuckerberg is a self-taught computer programmer and the co-founder, chair, and CEO of Meta (formerly Facebook). In April 2018, Zuckerberg testified on Capitol Hill about Facebook's use of users' information, including the sharing of 87 million users' information to Cambridge Analytica. Technically, Mark Zuckerberg makes a salary of $1 a year at Facebook. Booker Join With Facebook Founder and CEO Mark Zuckerberg to Advance a National Model for Improving Public Schools.\\\"\", \"score\": 0.74697095, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta CEO Mark Zuckerberg \\u201cloved\\u201d an image on Facebook known as \\\"Challah Horse\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. 
### Meta Donates $1 Million To Trump\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\", \"score\": 0.6410185, \"raw_content\": null}]}", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, + "[]_{\"kwargs\": {\"query\": \"when was the nba created\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": [ + { + "text": "knowledge_search tool found 3 chunks:\nBEGIN of knowledge_search tool results.\n", + "type": "text" + }, + { + "text": "Result 1:\nDocument_id:nba_w\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\n", + "type": "text" + }, + { + "text": "Result 2:\nDocument_id:perpl\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\n\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\n Konwinski was among the founding team at Databricks.\n Yarats, the CTO, was an AI research scientist at Meta.\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "Result 3:\nDocument_id:perpl\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\n", + "type": "text" + }, + { + "text": "END of knowledge_search tool results.\n", + "type": "text" + } + ], + "error_code": null, + "error_message": null, + "metadata": { + "document_ids": [ + "nba_wiki", + "perplexity_wiki", + "perplexity_wiki" + ] } - ], - "error_code": null, - "error_message": null, - "metadata": { - "document_ids": [ - "nba_wiki", - "perplexity_wiki", - "perplexity_wiki" - ] } } } diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.pickle b/tests/integration/fixtures/recorded_responses/invoke_tool.pickle deleted file mode 100644 index a03204511a05a9542d235aab184538a73de63b56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 67524 zcmeHwOLHSxc3#WVk|q#i%}61K?C>K}tmz@xVuB>dVv!VCGV9e{qbhby7kjG5RV5$+ zB!DaenV8H3+3cCNr3iUtMvNj%*^801amWh)0I$8X7Y^_I2OJJBy^t5uLK}bIIrruR z09mZ+?&sFT)m?8m3P)E>vNAEwUjyYD$s*Lmot-r^@q%T9GM zjiNy#^oBm~Kk-|Q#P{NMe{t#gIp6hWdOGsY&wlyX8;t$){+nn2WH9iCUL#4p_EF6n z1xbw~n7_}@e)ACD?uW-w+e-n_`+hPWr04(keCdy5xUV-FouHjI8uIN|+fkSTtn&~5 z+poXz#*Y>R+=U%yVFtJh%g#a?jT%Sz{ppVu(jXo9y!Gx!etVn-$A03tGB4gZjtBg$ zpQfW^XJy4}MdP$K#H4HOXt*-;JAvo+yukpgv*Kq1tng6-5+8r)#YfJ+7`Ko7xaG&a zW#?Wr4m*Cl?A-2qK|J(A#|u0768K#wgTr~R+YQ=&?0h%!2hQ(%qk$Ll3EuS1AZmF7 zXTRIU^GSB+<8gwq9t8HbdqL=h?ErUXV|m8`hJ0@j9Ot9GH}Yewa@fmmyWNk2#JL@P zWJh`wN1bsy&2HEaJF$P_JR8?n*S2m&AIpbr=T?-QL~+`;j~)8$ei#j+-f8yacl|I7 zx`5%pKS@&G>)3lA_*lv8wmaUj?>zDQp)M0A{Wt}beqpF5!C>Ir_R@aai_>N2T?|H~ zeA4&QWPdbr4!md>V6FV1@T;B4ccXrYM|wV%--7VIf9!{2KLcXF;|-mMQIhyE*3^!F zKLpbDV*GSB4npT{FCXAhjCE~ilU;TmMS}p}9|r9{HWBYm(`V1U#38@5M?{(LKjMV&Yx}JzI&uzzb~NyS+rYg+ zg6Ze?7<{@lPJ$35z$nzsCyo3- zh2z2C6OgO?@P5h@_Nfp25WC}koSpE`m;n4(SD3`s`QF%1Qs;f|)V+nHE|=u5ccVD? 
[... remaining base85-encoded GIT binary patch data for the deleted invoke_tool.pickle omitted ...]