From df4fbae35c8e93d7f97b0de9781654f3a65b9d9a Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 7 Mar 2025 12:45:08 -0500 Subject: [PATCH 01/58] ci: Add script to generate changelog (#1463) --- CHANGELOG.md | 139 ++++++++++++++++++++++++++------------- scripts/gen-changelog.py | 42 ++++++++++++ 2 files changed, 136 insertions(+), 45 deletions(-) create mode 100644 scripts/gen-changelog.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b3d937c86..5a9911915 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,20 @@ # Changelog -## v0.1.5.1 +# v0.1.5.1 +Published on: 2025-02-28T22:37:44Z -### What's Changed +## What's Changed * Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1 -## v0.1.5 +--- -### Build Agents +# v0.1.5 +Published on: 2025-02-28T18:14:01Z + +## 0.1.5 Release Notes +### Build Agents * Inference: Support more non-llama models (openai, anthropic, gemini) * Inference: Can use the provider's model name in addition to the HF alias * Inference: Fixed issues with calling tools that weren't specified in the prompt @@ -31,7 +36,7 @@ * Move most logging to use logger instead of prints * Completed text /chat-completion and /completion tests -### All changes +## All changes * test: add a ci-tests distro template for running e2e tests by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1237 * refactor: combine start scripts for each env by @cdoern in https://github.com/meta-llama/llama-stack/pull/1139 * fix: pre-commit updates by @cdoern in https://github.com/meta-llama/llama-stack/pull/1243 @@ -96,13 +101,19 @@ * fix: Agent telemetry inputs/outputs should be structured by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1302 * fix: check conda env name using basepath in exec.py by @dineshyv in https://github.com/meta-llama/llama-stack/pull/1301 -### New Contributors +## New Contributors * @Shreyanand made their first contribution in https://github.com/meta-llama/llama-stack/pull/1283 * @luis5tb made their first contribution in https://github.com/meta-llama/llama-stack/pull/1269 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.4...v0.1.5 -## v0.1.4 +--- + +# v0.1.4 +Published on: 2025-02-25T00:02:43Z + +## v0.1.4 Release Notes +Here are the key changes coming as part of this release: ### Build and Test Agents * Inference: Added support for non-llama models @@ -114,20 +125,18 @@ * Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs * Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults * VectorIO: Improved performance of sqlite-vec using chunked writes - ### Agent Evals and Model Customization * Deprecated api /eval-tasks. 
Use /eval/benchmark instead * Added CPU training support for TorchTune - ### Deploy and Monitoring of Agents * Consistent view of client and server tool calls in telemetry - ### Better Engineering * Made tests more data-driven for consistent evaluation * Fixed documentation links and improved API reference generation * Various small fixes for build scripts and system reliability -### What's Changed + +## What's Changed * build: resync uv and deps on 0.1.3 by @leseb in https://github.com/meta-llama/llama-stack/pull/1108 * style: fix the capitalization issue by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1117 * feat: log start, complete time to Agent steps by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1116 @@ -203,7 +212,7 @@ * fix: set default tool_prompt_format in inference api by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1214 * test: fix test_tool_choice by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1234 -### New Contributors +## New Contributors * @fulvius31 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1114 * @shrinitg made their first contribution in https://github.com/meta-llama/llama-stack/pull/543 * @raspawar made their first contribution in https://github.com/meta-llama/llama-stack/pull/1174 @@ -213,7 +222,14 @@ **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.3...v0.1.4 -## v0.1.3 +--- + +# v0.1.3 +Published on: 2025-02-14T20:24:32Z + +## v0.1.3 Release + +Here are some key changes that are coming as part of this release. ### Build and Test Agents Streamlined the initial development experience @@ -243,7 +259,7 @@ Infrastructure and code quality improvements - Added conventional commits standard - Fixed documentation parsing issues -### What's Changed +## What's Changed * Getting started notebook update by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/936 * docs: update index.md for 0.1.2 by @raghotham in https://github.com/meta-llama/llama-stack/pull/1013 * test: Make text-based chat completion tests run 10x faster by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1016 @@ -300,7 +316,7 @@ Infrastructure and code quality improvements * fix: improve stack build on venv by @leseb in https://github.com/meta-llama/llama-stack/pull/980 * fix: remove the empty line by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1097 -### New Contributors +## New Contributors * @MichaelClifford made their first contribution in https://github.com/meta-llama/llama-stack/pull/1009 * @ellistarn made their first contribution in https://github.com/meta-llama/llama-stack/pull/1035 * @kelbrown20 made their first contribution in https://github.com/meta-llama/llama-stack/pull/992 @@ -311,9 +327,12 @@ Infrastructure and code quality improvements **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.2...v0.1.3 -## v0.1.2 +--- -### TL;DR +# v0.1.2 +Published on: 2025-02-07T22:06:49Z + +# TL;DR - Several stabilizations to development flows after the switch to `uv` - Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops) - Added automated rebuilds for ReadTheDocs @@ -321,7 +340,7 @@ Infrastructure and code quality improvements - Added system prompt overrides support - Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan ) -### What's Changed +## What's Changed * Fix UBI9 image build when installing Python packages via uv 
by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/926 * Fix precommit check after moving to ruff by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/927 * LocalInferenceImpl update for LS 0.1 by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/911 @@ -373,7 +392,7 @@ Infrastructure and code quality improvements * fix: Ensure a better error stack trace when llama-stack is not built by @cdoern in https://github.com/meta-llama/llama-stack/pull/950 * refactor(ollama): model availability check by @leseb in https://github.com/meta-llama/llama-stack/pull/986 -### New Contributors +## New Contributors * @nathan-weinberg made their first contribution in https://github.com/meta-llama/llama-stack/pull/939 * @cdoern made their first contribution in https://github.com/meta-llama/llama-stack/pull/954 * @jwm4 made their first contribution in https://github.com/meta-llama/llama-stack/pull/957 @@ -386,11 +405,14 @@ Infrastructure and code quality improvements **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.1...v0.1.2 -## v0.1.1 +--- + +# v0.1.1 +Published on: 2025-02-02T02:29:24Z A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements. -### What's Changed +## What's Changed * Update doc templates for running safety on self-hosted templates by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/874 * Update GH action so it correctly queries for test.pypi, etc. by @ashwinb in https://github.com/meta-llama/llama-stack/pull/875 * Fix report generation for url endpoints by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/876 @@ -430,7 +452,7 @@ A bunch of small / big improvements everywhere including support for Windows, sw * Use `uv pip install` instead of `pip install` by @ashwinb in https://github.com/meta-llama/llama-stack/pull/921 * add image support to NVIDIA inference provider by @mattf in https://github.com/meta-llama/llama-stack/pull/907 -### New Contributors +## New Contributors * @BakungaBronson made their first contribution in https://github.com/meta-llama/llama-stack/pull/877 * @Ckhanoyan made their first contribution in https://github.com/meta-llama/llama-stack/pull/888 * @hanzlfs made their first contribution in https://github.com/meta-llama/llama-stack/pull/660 @@ -438,24 +460,27 @@ A bunch of small / big improvements everywhere including support for Windows, sw **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.0...v0.1.1 -## v0.1.0 +--- + +# v0.1.0 +Published on: 2025-01-24T17:47:47Z We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor and those agents with telemetry, and evaluate the agent with scoring functions. -### Context +## Context GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. 
This is particularly difficult for open models like Llama, as best practices are not widely established in the open. Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety. With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience. -### Release +## Release After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements. There are example standalone apps in llama-stack-apps. -### Key Features of this release +## Key Features of this release - **Unified API Layer** - Inference: Run LLM models @@ -490,6 +515,7 @@ There are example standalone apps in llama-stack-apps. - iOS - Android + ### What's Changed * [4/n][torchtune integration] support lazy load model during inference by @SLR722 in https://github.com/meta-llama/llama-stack/pull/620 * remove unused telemetry related code for console by @dineshyv in https://github.com/meta-llama/llama-stack/pull/659 @@ -650,7 +676,7 @@ There are example standalone apps in llama-stack-apps. * remove logger handler only in notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/868 * Update 'first RAG agent' in gettingstarted doc by @ehhuang in https://github.com/meta-llama/llama-stack/pull/867 -### New Contributors +## New Contributors * @cdgamarose-nv made their first contribution in https://github.com/meta-llama/llama-stack/pull/661 * @eltociear made their first contribution in https://github.com/meta-llama/llama-stack/pull/675 * @derekslager made their first contribution in https://github.com/meta-llama/llama-stack/pull/692 @@ -663,9 +689,12 @@ There are example standalone apps in llama-stack-apps. 
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.63...v0.1.0 -## v0.1.0rc12 +--- -### What's Changed +# v0.1.0rc12 +Published on: 2025-01-22T22:24:01Z + +## What's Changed * [4/n][torchtune integration] support lazy load model during inference by @SLR722 in https://github.com/meta-llama/llama-stack/pull/620 * remove unused telemetry related code for console by @dineshyv in https://github.com/meta-llama/llama-stack/pull/659 * Fix Meta reference GPU implementation by @ashwinb in https://github.com/meta-llama/llama-stack/pull/663 @@ -800,7 +829,7 @@ There are example standalone apps in llama-stack-apps. * Fix fireworks client sdk chat completion with images by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/840 * [inference api] modify content types so they follow a more standard structure by @ashwinb in https://github.com/meta-llama/llama-stack/pull/841 -### New Contributors +## New Contributors * @cdgamarose-nv made their first contribution in https://github.com/meta-llama/llama-stack/pull/661 * @eltociear made their first contribution in https://github.com/meta-llama/llama-stack/pull/675 * @derekslager made their first contribution in https://github.com/meta-llama/llama-stack/pull/692 @@ -810,15 +839,21 @@ There are example standalone apps in llama-stack-apps. **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.63...v0.1.0rc11 -## v0.0.63 +--- + +# v0.0.63 +Published on: 2024-12-18T07:17:43Z A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially. **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63 -## v0.0.62 +--- -### What's Changed +# v0.0.62 +Published on: 2024-12-18T02:39:43Z + +## What's Changed A few important updates some of which are backwards incompatible. You must update your `run.yaml`s when upgrading. As always look to `templates//run.yaml` for reference. @@ -838,15 +873,18 @@ A variety of fixes and enhancements. Some selected ones: * [tests] add client-sdk pytests & delete client.py by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/638 * [bugfix] no shield_call when there's no shields configured by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/642 -### New Contributors +## New Contributors * @SLR722 made their first contribution in https://github.com/meta-llama/llama-stack/pull/540 * @iamarunbrahma made their first contribution in https://github.com/meta-llama/llama-stack/pull/636 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.61...v0.0.62 -## v0.0.61 +--- -### What's Changed +# v0.0.61 +Published on: 2024-12-10T20:50:33Z + +## What's Changed * add NVIDIA NIM inference adapter by @mattf in https://github.com/meta-llama/llama-stack/pull/355 * Tgi fixture by @dineshyv in https://github.com/meta-llama/llama-stack/pull/519 * fixes tests & move braintrust api_keys to request headers by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/535 @@ -888,7 +926,7 @@ A variety of fixes and enhancements. 
Some selected ones: * Fixes for library client by @ashwinb in https://github.com/meta-llama/llama-stack/pull/587 * Fix issue 586 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/594 -### New Contributors +## New Contributors * @sablair made their first contribution in https://github.com/meta-llama/llama-stack/pull/549 * @JeffreyLind3 made their first contribution in https://github.com/meta-llama/llama-stack/pull/547 * @aidando73 made their first contribution in https://github.com/meta-llama/llama-stack/pull/554 @@ -899,9 +937,12 @@ A variety of fixes and enhancements. Some selected ones: **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.55...v0.0.61 -## v0.0.55 +--- -### What's Changed +# v0.0.55 +Published on: 2024-11-23T17:14:07Z + +## What's Changed * Fix TGI inference adapter * Fix `llama stack build` in 0.0.54 by @dltn in https://github.com/meta-llama/llama-stack/pull/505 * Several documentation related improvements @@ -910,9 +951,12 @@ A variety of fixes and enhancements. Some selected ones: **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.54...v0.0.55 -## v0.0.54 +--- -### What's Changed +# v0.0.54 +Published on: 2024-11-22T00:36:09Z + +## What's Changed * Bugfixes release on top of 0.0.53 * Don't depend on templates.py when print llama stack build messages by @ashwinb in https://github.com/meta-llama/llama-stack/pull/496 * Restructure docs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/494 @@ -920,12 +964,15 @@ A variety of fixes and enhancements. Some selected ones: * Fix fp8 quantization script. by @liyunlu0618 in https://github.com/meta-llama/llama-stack/pull/500 * use logging instead of prints by @dineshyv in https://github.com/meta-llama/llama-stack/pull/499 -### New Contributors +## New Contributors * @liyunlu0618 made their first contribution in https://github.com/meta-llama/llama-stack/pull/500 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.53...v0.0.54 -## v0.0.53 +--- + +# v0.0.53 +Published on: 2024-11-20T22:18:00Z 🚀 Initial Release Notes for Llama Stack! @@ -961,7 +1008,7 @@ A variety of fixes and enhancements. Some selected ones: ### Removed - `llama stack configure` command -### What's Changed +## What's Changed * Update download command by @Wauplin in https://github.com/meta-llama/llama-stack/pull/9 * Update fbgemm version by @jianyuh in https://github.com/meta-llama/llama-stack/pull/12 * Add CLI reference docs by @dltn in https://github.com/meta-llama/llama-stack/pull/14 @@ -1187,7 +1234,7 @@ A variety of fixes and enhancements. Some selected ones: * register with provider even if present in stack by @dineshyv in https://github.com/meta-llama/llama-stack/pull/491 * Make run yaml optional so dockers can start with just --env by @ashwinb in https://github.com/meta-llama/llama-stack/pull/492 -### New Contributors +## New Contributors * @Wauplin made their first contribution in https://github.com/meta-llama/llama-stack/pull/9 * @jianyuh made their first contribution in https://github.com/meta-llama/llama-stack/pull/12 * @dltn made their first contribution in https://github.com/meta-llama/llama-stack/pull/14 @@ -1240,3 +1287,5 @@ A variety of fixes and enhancements. 
Some selected ones: * @iseeyuan made their first contribution in https://github.com/meta-llama/llama-stack/pull/485 **Full Changelog**: https://github.com/meta-llama/llama-stack/commits/v0.0.53 + +--- diff --git a/scripts/gen-changelog.py b/scripts/gen-changelog.py new file mode 100644 index 000000000..3d5197e03 --- /dev/null +++ b/scripts/gen-changelog.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import requests +import os + +def get_all_releases(token): + url = f"https://api.github.com/repos/meta-llama/llama-stack/releases" + headers = {"Accept": "application/vnd.github.v3+json"} + + if token: + headers["Authorization"] = f"token {token}" + + response = requests.get(url, headers=headers) + + if response.status_code == 200: + return response.json() + else: + raise Exception(f"Error fetching releases: {response.status_code}, {response.text}") + + +def merge_release_notes(output_file, token=None): + releases = get_all_releases(token) + + with open(output_file, "w", encoding="utf-8") as md_file: + md_file.write(f"# Changelog\n\n") + + for release in releases: + md_file.write(f"# {release['tag_name']}\n") + md_file.write(f"Published on: {release['published_at']}\n\n") + md_file.write(f"{release['body']}\n\n") + md_file.write("---\n\n") + + print(f"Merged release notes saved to {output_file}") + +if __name__ == "__main__": + OUTPUT_FILE = "CHANGELOG.md" + TOKEN = os.getenv("GITHUB_TOKEN") + merge_release_notes(OUTPUT_FILE, TOKEN) From 60e7f3d705d9a61fc82695d045c7330102d81e40 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Fri, 7 Mar 2025 10:16:47 -0800 Subject: [PATCH 02/58] fix: Revert "feat: record token usage for inference API (#1300)" (#1476) This reverts commit b8535417e0f9986b096c24d6811689b11c17d7ae. Test plan: LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run ~/.llama/distributions/together/together-run.yaml python -m examples.agents.e2e_loop_with_client_tools localhost 8321 --- llama_stack/apis/inference/inference.py | 8 +- llama_stack/distribution/resolver.py | 4 +- llama_stack/distribution/routers/__init__.py | 12 +- llama_stack/distribution/routers/routers.py | 148 +----------------- .../telemetry/meta_reference/telemetry.py | 3 - 5 files changed, 14 insertions(+), 161 deletions(-) diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index fa917ac22..d0f5d15c5 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -285,7 +285,7 @@ class CompletionRequest(BaseModel): @json_schema_type -class CompletionResponse(MetricResponseMixin): +class CompletionResponse(BaseModel): """Response from a completion request. :param content: The generated completion text @@ -299,7 +299,7 @@ class CompletionResponse(MetricResponseMixin): @json_schema_type -class CompletionResponseStreamChunk(MetricResponseMixin): +class CompletionResponseStreamChunk(BaseModel): """A chunk of a streamed completion response. :param delta: New content generated since last chunk. This can be one or more tokens. @@ -368,7 +368,7 @@ class ChatCompletionRequest(BaseModel): @json_schema_type -class ChatCompletionResponseStreamChunk(MetricResponseMixin): +class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel): """A chunk of a streamed chat completion response. 
:param event: The event containing the new content @@ -378,7 +378,7 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin): @json_schema_type -class ChatCompletionResponse(MetricResponseMixin): +class ChatCompletionResponse(MetricResponseMixin, BaseModel): """Response from a chat completion request. :param completion_message: The complete response message diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 624a4f2c2..c24df384d 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -163,9 +163,7 @@ def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str, module="llama_stack.distribution.routers", routing_table_api=info.routing_table_api, api_dependencies=[info.routing_table_api], - # Add telemetry as an optional dependency to all auto-routed providers - optional_api_dependencies=[Api.telemetry], - deps__=([info.routing_table_api.value, Api.telemetry.value]), + deps__=[info.routing_table_api.value], ), ) } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index d0fca8771..a54f57fb3 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -45,7 +45,7 @@ async def get_routing_table_impl( return impl -async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict[str, Any]) -> Any: +async def get_auto_router_impl(api: Api, routing_table: RoutingTable, _deps) -> Any: from .routers import ( DatasetIORouter, EvalRouter, @@ -65,17 +65,9 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict "eval": EvalRouter, "tool_runtime": ToolRuntimeRouter, } - api_to_deps = { - "inference": {"telemetry": Api.telemetry}, - } if api.value not in api_to_routers: raise ValueError(f"API {api.value} not found in router map") - api_to_dep_impl = {} - for dep_name, dep_api in api_to_deps.get(api.value, {}).items(): - if dep_api in deps: - api_to_dep_impl[dep_name] = deps[dep_api] - - impl = api_to_routers[api.value](routing_table, **api_to_dep_impl) + impl = api_to_routers[api.value](routing_table) await impl.initialize() return impl diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 3cfc2b119..f2c70e66f 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -4,8 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import time -from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union +from typing import Any, AsyncGenerator, Dict, List, Optional from llama_stack import logcat from llama_stack.apis.common.content_types import ( @@ -22,10 +21,6 @@ from llama_stack.apis.eval import ( JobStatus, ) from llama_stack.apis.inference import ( - ChatCompletionResponse, - ChatCompletionResponseEventType, - ChatCompletionResponseStreamChunk, - CompletionMessage, EmbeddingsResponse, EmbeddingTaskType, Inference, @@ -33,14 +28,13 @@ from llama_stack.apis.inference import ( Message, ResponseFormat, SamplingParams, - StopReason, TextTruncation, ToolChoice, ToolConfig, ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.models import Model, ModelType +from llama_stack.apis.models import ModelType from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.scoring import ( ScoreBatchResponse, @@ -49,7 +43,6 @@ from llama_stack.apis.scoring import ( ScoringFnParams, ) from llama_stack.apis.shields import Shield -from llama_stack.apis.telemetry import MetricEvent, Telemetry from llama_stack.apis.tools import ( RAGDocument, RAGQueryConfig, @@ -59,10 +52,7 @@ from llama_stack.apis.tools import ( ToolRuntime, ) from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO -from llama_stack.models.llama.llama3.chat_format import ChatFormat -from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import RoutingTable -from llama_stack.providers.utils.telemetry.tracing import get_current_span class VectorIORouter(VectorIO): @@ -131,14 +121,9 @@ class InferenceRouter(Inference): def __init__( self, routing_table: RoutingTable, - telemetry: Optional[Telemetry] = None, ) -> None: logcat.debug("core", "Initializing InferenceRouter") self.routing_table = routing_table - self.telemetry = telemetry - if self.telemetry: - self.tokenizer = Tokenizer.get_instance() - self.formatter = ChatFormat(self.tokenizer) async def initialize(self) -> None: logcat.debug("core", "InferenceRouter.initialize") @@ -162,57 +147,6 @@ class InferenceRouter(Inference): ) await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type) - def _construct_metrics( - self, prompt_tokens: int, completion_tokens: int, total_tokens: int, model: Model - ) -> List[MetricEvent]: - span = get_current_span() - metrics = [ - ("prompt_tokens", prompt_tokens), - ("completion_tokens", completion_tokens), - ("total_tokens", total_tokens), - ] - metric_events = [] - for metric_name, value in metrics: - metric_events.append( - MetricEvent( - trace_id=span.trace_id, - span_id=span.span_id, - metric=metric_name, - value=value, - timestamp=time.time(), - unit="tokens", - attributes={ - "model_id": model.model_id, - "provider_id": model.provider_id, - }, - ) - ) - return metric_events - - async def _compute_and_log_token_usage( - self, - prompt_tokens: int, - completion_tokens: int, - total_tokens: int, - model: Model, - ) -> List[MetricEvent]: - metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) - if self.telemetry: - for metric in metrics: - await self.telemetry.log_event(metric) - return metrics - - async def _count_tokens( - self, - messages: List[Message] | InterleavedContent, - tool_prompt_format: Optional[ToolPromptFormat] = None, - ) -> Optional[int]: - if isinstance(messages, list): - encoded = self.formatter.encode_dialog_prompt(messages, tool_prompt_format) - else: - encoded = 
self.formatter.encode_content(messages) - return len(encoded.tokens) if encoded and encoded.tokens else 0 - async def chat_completion( self, model_id: str, @@ -225,7 +159,7 @@ class InferenceRouter(Inference): stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, tool_config: Optional[ToolConfig] = None, - ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]: + ) -> AsyncGenerator: logcat.debug( "core", f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}", @@ -276,47 +210,10 @@ class InferenceRouter(Inference): tool_config=tool_config, ) provider = self.routing_table.get_provider_impl(model_id) - prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format) - if stream: - - async def stream_generator(): - completion_text = "" - async for chunk in await provider.chat_completion(**params): - if chunk.event.event_type == ChatCompletionResponseEventType.progress: - if chunk.event.delta.type == "text": - completion_text += chunk.event.delta.text - if chunk.event.event_type == ChatCompletionResponseEventType.complete: - completion_tokens = await self._count_tokens( - [CompletionMessage(content=completion_text, stop_reason=StopReason.end_of_turn)], - tool_config.tool_prompt_format, - ) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics - yield chunk - - return stream_generator() + return (chunk async for chunk in await provider.chat_completion(**params)) else: - response = await provider.chat_completion(**params) - completion_tokens = await self._count_tokens( - [response.completion_message], - tool_config.tool_prompt_format, - ) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - response.metrics = metrics if response.metrics is None else response.metrics + metrics - return response + return await provider.chat_completion(**params) async def completion( self, @@ -347,41 +244,10 @@ class InferenceRouter(Inference): stream=stream, logprobs=logprobs, ) - - prompt_tokens = await self._count_tokens(content) - if stream: - - async def stream_generator(): - completion_text = "" - async for chunk in await provider.completion(**params): - if hasattr(chunk, "delta"): - completion_text += chunk.delta - if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry: - completion_tokens = await self._count_tokens(completion_text) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics - yield chunk - - return stream_generator() + return (chunk async for chunk in await provider.completion(**params)) else: - response = await provider.completion(**params) - completion_tokens = await self._count_tokens(response.content) - total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) - metrics = await self._compute_and_log_token_usage( - prompt_tokens or 0, - completion_tokens or 0, - total_tokens, - model, - ) - response.metrics = metrics if response.metrics is None else response.metrics 
+ metrics - return response + return await provider.completion(**params) async def embeddings( self, diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index 4cdb420b2..e713a057f 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -73,7 +73,6 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): def __init__(self, config: TelemetryConfig, deps: Dict[str, Any]) -> None: self.config = config self.datasetio_api = deps.get(Api.datasetio) - self.meter = None resource = Resource.create( { @@ -172,8 +171,6 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): return _GLOBAL_STORAGE["gauges"][name] def _log_metric(self, event: MetricEvent) -> None: - if self.meter is None: - return if isinstance(event.value, int): counter = self._get_or_create_counter(event.metric, event.unit) counter.add(event.value, attributes=event.attributes) From 290cc843fc68e97cbfa4aec6745707f82a53dd25 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Mar 2025 10:20:51 -0800 Subject: [PATCH 03/58] test: first unit test for resolver (#1475) Starting to create unit tests to cover critical (and mostly undocumented) provider resolution and routing logic. ## Test Plan Unit tests --- .cursor/rules/general.mdc | 9 +++ tests/unit/server/test_resolver.py | 117 +++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 .cursor/rules/general.mdc create mode 100644 tests/unit/server/test_resolver.py diff --git a/.cursor/rules/general.mdc b/.cursor/rules/general.mdc new file mode 100644 index 000000000..24daef2ba --- /dev/null +++ b/.cursor/rules/general.mdc @@ -0,0 +1,9 @@ +--- +description: General rules always applicable across the project +globs: +alwaysApply: true +--- +# Style + +- Comments must add value to code. Don't write filler comments explaining what you are doing next; they just add noise. +- Add a comment to clarify surprising behavior which would not be obvious. Good variable naming and clear code organization is more important. diff --git a/tests/unit/server/test_resolver.py b/tests/unit/server/test_resolver.py new file mode 100644 index 000000000..fcf0b3945 --- /dev/null +++ b/tests/unit/server/test_resolver.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import inspect +import sys +from typing import Any, Dict, Protocol +from unittest.mock import AsyncMock, MagicMock + +import pytest +from pydantic import BaseModel, Field + +from llama_stack.apis.inference import Inference +from llama_stack.distribution.datatypes import ( + Api, + Provider, + StackRunConfig, +) +from llama_stack.distribution.resolver import resolve_impls +from llama_stack.distribution.routers.routers import InferenceRouter +from llama_stack.distribution.routers.routing_tables import ModelsRoutingTable +from llama_stack.providers.datatypes import InlineProviderSpec, ProviderSpec + + +def add_protocol_methods(cls: type, protocol: type[Protocol]) -> None: + """Dynamically add protocol methods to a class by inspecting the protocol.""" + for name, value in inspect.getmembers(protocol): + if inspect.isfunction(value) and hasattr(value, "__webmethod__"): + # Get the signature + sig = inspect.signature(value) + + # Create an async function with the same signature that returns a MagicMock + async def mock_impl(*args, **kwargs): + return MagicMock() + + # Set the signature on our mock implementation + mock_impl.__signature__ = sig + # Add it to the class + setattr(cls, name, mock_impl) + + +class SampleConfig(BaseModel): + foo: str = Field( + default="bar", + description="foo", + ) + + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return { + "foo": "baz", + } + + +class SampleImpl: + def __init__(self, config: SampleConfig, deps: Dict[Api, Any], provider_spec: ProviderSpec = None): + self.__provider_id__ = "test_provider" + self.__provider_spec__ = provider_spec + self.__provider_config__ = config + self.__deps__ = deps + self.foo = config.foo + + async def initialize(self): + pass + + +@pytest.mark.asyncio +async def test_resolve_impls_basic(): + # Create a real provider spec + provider_spec = InlineProviderSpec( + api=Api.inference, + provider_type="sample", + module="test_module", + config_class="test_resolver.SampleConfig", + api_dependencies=[], + ) + + # Create provider registry with our provider + provider_registry = {Api.inference: {provider_spec.provider_type: provider_spec}} + + run_config = StackRunConfig( + image_name="test_image", + providers={ + "inference": [ + Provider( + provider_id="sample_provider", + provider_type="sample", + config=SampleConfig.sample_run_config(), + ) + ] + }, + ) + + dist_registry = MagicMock() + + mock_module = MagicMock() + impl = SampleImpl(SampleConfig(foo="baz"), {}, provider_spec) + add_protocol_methods(SampleImpl, Inference) + + mock_module.get_provider_impl = AsyncMock(return_value=impl) + sys.modules["test_module"] = mock_module + + impls = await resolve_impls(run_config, provider_registry, dist_registry) + + assert Api.inference in impls + assert isinstance(impls[Api.inference], InferenceRouter) + + table = impls[Api.inference].routing_table + assert isinstance(table, ModelsRoutingTable) + + impl = table.impls_by_provider_id["sample_provider"] + assert isinstance(impl, SampleImpl) + assert impl.foo == "baz" + assert impl.__provider_id__ == "sample_provider" + assert impl.__provider_spec__ == provider_spec From 4dccf916d1e6ae80c1e6bdf7f516c64848433928 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Fri, 7 Mar 2025 10:37:55 -0800 Subject: [PATCH 04/58] feat: open benchmark template and doc (#1465) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What does this PR do? 
- Provide a distro template to let developers easily run the open benchmarks llama stack supports on llama and non-llama models.
- Provide docs on how to run open benchmark evals via CLI and an open benchmark contributing guide

[//]: # (If resolving an issue, uncomment and update the line below)
(Closes #1375 )

## Test Plan

open benchmark eval results on llama, gpt, gemini and claude (screenshot omitted)

doc preview (screenshots omitted)
---
 distributions/dependencies.json               |  36 +++
 docs/source/concepts/evaluation_concepts.md   |  50 +++++
 .../references/evals_reference/index.md       |  76 ++++++-
 .../templates/open-benchmark/__init__.py      |   7 +
 .../templates/open-benchmark/build.yaml       |  36 +++
 .../open-benchmark/open_benchmark.py          | 178 +++++++++++++++
 llama_stack/templates/open-benchmark/run.yaml | 212 ++++++++++++++++++
 7 files changed, 585 insertions(+), 10 deletions(-)
 create mode 100644 llama_stack/templates/open-benchmark/__init__.py
 create mode 100644 llama_stack/templates/open-benchmark/build.yaml
 create mode 100644 llama_stack/templates/open-benchmark/open_benchmark.py
 create mode 100644 llama_stack/templates/open-benchmark/run.yaml

diff --git a/distributions/dependencies.json b/distributions/dependencies.json
index 59b0c9e62..5623e251a 100644
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@@ -453,6 +453,42 @@
     "transformers",
     "uvicorn"
   ],
+  "open_benchmark": [
+    "aiosqlite",
+    "autoevals",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "datasets",
+    "fastapi",
+    "fire",
+    "httpx",
+    "litellm",
+    "matplotlib",
+    "mcp",
+    "nltk",
+    "numpy",
+    "openai",
+    "opentelemetry-exporter-otlp-proto-http",
+    "opentelemetry-sdk",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pymongo",
+    "pypdf",
+    "redis",
+    "requests",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "sqlite-vec",
+    "together",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
+  ],
   "remote-vllm": [
     "aiosqlite",
     "autoevals",
diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md
index eae606712..61a695d9f 100644
--- a/docs/source/concepts/evaluation_concepts.md
+++ b/docs/source/concepts/evaluation_concepts.md
@@ -24,6 +24,56 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
 
 - Associated with `Benchmark` resource.
 
+## Open-benchmark Eval
+
+### List of open-benchmarks Llama Stack supports
+
+Llama Stack pre-registers several popular open-benchmarks so you can easily evaluate model performance via the CLI (a programmatic sketch follows the list below).
+
+The list of open-benchmarks we currently support:
+- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
+- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
+- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess models' ability to answer short, fact-seeking questions.
+- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
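+
+For illustration, here is a minimal sketch of kicking one of these benchmarks off programmatically rather than via the CLI. The client method and the `benchmark_config` shape are assumptions based on the eval API; the benchmark and model ids come from the `open-benchmark` template. Check the eval API reference for the exact signature.
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+# Kick off the pre-registered MMLU-COT benchmark against one of the
+# template's models (assumed signature; verify against the eval API docs).
+job = client.eval.run_eval(
+    benchmark_id="meta-reference-mmlu-cot",
+    benchmark_config={
+        "eval_candidate": {
+            "type": "model",
+            "model": "openai/gpt-4o",
+            "sampling_params": {"max_tokens": 4096},
+        },
+    },
+)
+print(job)
+```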
+
+
+You can follow the contributing guide below to add more open-benchmarks to Llama Stack.
+
+### Run evaluation on open-benchmarks via CLI
+
+We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.
+
+#### Spin up Llama Stack server
+
+Spin up the Llama Stack server with the 'open-benchmark' template:
+```
+llama stack run llama_stack/templates/open-benchmark/run.yaml
+```
+
+#### Run eval CLI
+There are 3 necessary inputs to run a benchmark eval:
+- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
+- `model_id`: The model id to evaluate on
+- `output_dir`: Path to store the evaluation results
+```
+llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
+--model_id <model_id> \
+--output_dir <output_dir>
+```
+
+You can run
+```
+llama-stack-client eval run-benchmark --help
+```
+to see the descriptions of all the flags that `eval run-benchmark` supports.
+
+In the output log, you can find the path to the file that has your evaluation results. Open that file and you can see your
+aggregate evaluation results there.
+
 
 ## What's Next?
 
 - Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md
index 14ce0bf34..d55537c47 100644
--- a/docs/source/references/evals_reference/index.md
+++ b/docs/source/references/evals_reference/index.md
@@ -275,18 +275,25 @@ response = client.scoring.score(
 The following examples give the quick steps to start running evaluations using the llama-stack-client CLI.
 
 #### Benchmark Evaluation CLI
-Usage: There are 2 inputs necessary for running a benchmark eval
-- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by
-   - `dataset_id`: the identifier associated with the dataset.
-   - `List[scoring_function_id]`: list of scoring function identifiers.
-- `eval-task-config`: specifies the configuration of the model / agent to evaluate on.
+There are 3 necessary inputs for running a benchmark eval:
+- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
+- `model_id`: The model id to evaluate on
+- `output_dir`: Path to store the evaluation results
+```
+llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
+--model_id <model_id> \
+--output_dir <output_dir>
+```
+
+You can run
+```
+llama-stack-client eval run-benchmark --help
+```
+to see the descriptions of all the flags for benchmark eval.
 
-```
-llama-stack-client eval run_benchmark <eval-task-id> \
---eval-task-config ~/benchmark_config.json \
---visualize
-```
+In the output log, you can find the path to the file that has your evaluation results. Open that file and you can see your
+aggregate evaluation results there.
 
 #### Application Evaluation CLI
@@ -338,3 +345,52 @@ The `BenchmarkConfig` are user specified config to define:
 }
 }
+```
+
+
+## Open-benchmark Contributing Guide
+
+### Create the new dataset for your new benchmark
+An eval open-benchmark essentially contains 2 parts (a sketch of combining them follows this list):
+- `raw data`: The raw dataset associated with the benchmark. You typically need to search the original paper that introduces the benchmark and find the canonical dataset (usually hosted on Hugging Face)
+- `prompt template`: How to ask the candidate model to generate the answer (the prompt template plays a critical role in the evaluation results). Typically, you can find the reference prompt template associated with the benchmark in the benchmark author's repo ([example](https://github.com/idavidrein/gpqa/blob/main/prompts/chain_of_thought.txt)) or in other popular open source repos ([example](https://github.com/openai/simple-evals/blob/0a6e8f62e52bc5ae915f752466be3af596caf392/common.py#L14))
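+
+As a concrete illustration of combining the two parts, here is a minimal sketch of a convert script. The raw dataset name, column names, and prompt are hypothetical placeholders, and the exact encoding of `chat_completion_input` should follow the example convert script linked below.
+
+```python
+from datasets import load_dataset
+
+# Hypothetical prompt template for the benchmark being added.
+PROMPT = "Answer the following question.\n\nQuestion: {question}\nAnswer:"
+
+def to_eval_row(row):
+    # Combine the prompt template and the raw data into the eval schema columns.
+    return {
+        "input_query": row["question"],
+        "expected_answer": row["answer"],
+        "chat_completion_input": PROMPT.format(question=row["question"]),
+    }
+
+raw = load_dataset("some-org/raw-benchmark", split="train")  # hypothetical raw dataset
+eval_ds = raw.map(to_eval_row, remove_columns=raw.column_names)
+eval_ds.push_to_hub("your-org/your-benchmark-eval")  # hypothetical dataset id
+```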
+
+To create a new open-benchmark in llama stack, you need to combine the prompt template and the raw data into the `chat_completion_input` column in the evaluation dataset.
+
+Llama stack enforces the evaluation dataset schema to contain at least 3 columns:
+- `chat_completion_input`: The actual input to the model to run the generation for eval
+- `input_query`: The raw input from the raw dataset without the prompt template
+- `expected_answer`: The ground truth for scoring functions to calculate the score from.
+
+You need to write a script ([example convert script](https://gist.github.com/yanxi0830/118e9c560227d27132a7fd10e2c92840)) to convert the benchmark raw dataset to the llama stack format eval dataset, and upload the converted dataset to Hugging Face ([example benchmark dataset](https://huggingface.co/datasets/llamastack/mmmu))
+
+### Find scoring function for your new benchmark
+The purpose of a scoring function is to calculate the score for each example based on the candidate model's generation result and the expected_answer. It also aggregates the scores across all the examples and generates the final evaluation results.
+
+Firstly, you can see if the existing [llama stack scoring functions](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/scoring) can fulfill your need. If not, you need to write a new scoring function based on what the benchmark author / other open source repos describe.
+
+### Add new benchmark into template
+Firstly, you need to add the evaluation dataset associated with your benchmark under the `datasets` resource in templates/open-benchmark/run.yaml
+
+Secondly, you need to add the new benchmark you just created under the `benchmarks` resource in the same template. To add the new benchmark, you need to have
+- `benchmark_id`: identifier of the benchmark
+- `dataset_id`: identifier of the dataset associated with your benchmark
+- `scoring_functions`: scoring function to calculate the score based on generation results and expected_answer
+
+### Test the new benchmark
+
+Spin up the Llama Stack server with the 'open-benchmark' template:
+```
+llama stack run llama_stack/templates/open-benchmark/run.yaml
+```
+
+Run the eval benchmark CLI with your new benchmark id:
+```
+llama-stack-client eval run-benchmark <new_benchmark_id> \
+--model_id <model_id> \
+--output_dir <output_dir>
+```
diff --git a/llama_stack/templates/open-benchmark/__init__.py b/llama_stack/templates/open-benchmark/__init__.py
new file mode 100644
index 000000000..14d0a28f5
--- /dev/null
+++ b/llama_stack/templates/open-benchmark/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+ +from .open_benchmark import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/open-benchmark/build.yaml b/llama_stack/templates/open-benchmark/build.yaml new file mode 100644 index 000000000..1db90ef27 --- /dev/null +++ b/llama_stack/templates/open-benchmark/build.yaml @@ -0,0 +1,36 @@ +version: '2' +distribution_spec: + description: Distribution for running open benchmarks + providers: + inference: + - remote::openai + - remote::anthropic + - remote::gemini + - remote::groq + - remote::together + vector_io: + - inline::sqlite-vec + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust + tool_runtime: + - remote::brave-search + - remote::tavily-search + - inline::code-interpreter + - inline::rag-runtime + - remote::model-context-protocol +image_type: conda diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py new file mode 100644 index 000000000..9ef84456e --- /dev/null +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -0,0 +1,178 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List, Tuple + +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig +from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig +from llama_stack.providers.remote.inference.anthropic.models import MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES +from llama_stack.providers.remote.inference.gemini.config import GeminiConfig +from llama_stack.providers.remote.inference.gemini.models import MODEL_ENTRIES as GEMINI_MODEL_ENTRIES +from llama_stack.providers.remote.inference.groq.config import GroqConfig +from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES as GROQ_MODEL_ENTRIES +from llama_stack.providers.remote.inference.openai.config import OpenAIConfig +from llama_stack.providers.remote.inference.openai.models import MODEL_ENTRIES as OPENAI_MODEL_ENTRIES +from llama_stack.providers.remote.inference.together.config import TogetherImplConfig +from llama_stack.providers.remote.inference.together.models import MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES +from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig +from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry + + +def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: + # in this template, we allow each API key to be optional + providers = [ + ( + "openai", + OPENAI_MODEL_ENTRIES, + OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:}"), + ), + ( + "anthropic", + ANTHROPIC_MODEL_ENTRIES, + AnthropicConfig.sample_run_config(api_key="${env.ANTHROPIC_API_KEY:}"), + ), + ( + "gemini", + GEMINI_MODEL_ENTRIES, + GeminiConfig.sample_run_config(api_key="${env.GEMINI_API_KEY:}"), + ), + ( + "groq", + GROQ_MODEL_ENTRIES, + 
GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"), + ), + ( + "together", + TOGETHER_MODEL_ENTRIES, + TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), + ), + ] + inference_providers = [] + available_models = {} + for provider_id, model_entries, config in providers: + inference_providers.append( + Provider( + provider_id=provider_id, + provider_type=f"remote::{provider_id}", + config=config, + ) + ) + available_models[provider_id] = model_entries + return inference_providers, available_models + + +def get_distribution_template() -> DistributionTemplate: + inference_providers, available_models = get_inference_providers() + providers = { + "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), + "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + name = "open_benchmark" + + vector_io_providers = [ + Provider( + provider_id="sqlite-vec", + provider_type="inline::sqlite-vec", + config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ), + Provider( + provider_id="${env.ENABLE_CHROMADB+chromadb}", + provider_type="remote::chromadb", + config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"), + ), + Provider( + provider_id="${env.ENABLE_PGVECTOR+pgvector}", + provider_type="remote::pgvector", + config=PGVectorVectorIOConfig.sample_run_config( + db="${env.PGVECTOR_DB:}", + user="${env.PGVECTOR_USER:}", + password="${env.PGVECTOR_PASSWORD:}", + ), + ), + ] + + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + + default_models = get_model_registry(available_models) + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Distribution for running open benchmarks", + container_image=None, + template_path=None, + providers=providers, + available_models_by_provider=available_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": inference_providers, + "vector_io": vector_io_providers, + }, + default_models=default_models, + default_tool_groups=default_tool_groups, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "OPENAI_API_KEY": ( + "", + "OpenAI API Key", + ), + "GEMINI_API_KEY": ( + "", + "Gemini API Key", + ), + "GROQ_API_KEY": ( + "", + "Groq API Key", + ), + "ANTHROPIC_API_KEY": ( + "", + "Anthropic API Key", + ), + "TOGETHER_API_KEY": ( + "", + "Together API Key", + ), + }, + ) diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml new file mode 100644 index 000000000..ba495923c --- /dev/null +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -0,0 +1,212 @@ +version: 
'2' +image_name: dev +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY:} + - provider_id: anthropic + provider_type: remote::anthropic + config: + api_key: ${env.ANTHROPIC_API_KEY:} + - provider_id: gemini + provider_type: remote::gemini + config: + api_key: ${env.GEMINI_API_KEY:} + - provider_id: groq + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY:} + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY} + vector_io: + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/sqlite_vec.db + - provider_id: ${env.ENABLE_CHROMADB+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:} + - provider_id: ${env.ENABLE_PGVECTOR+pgvector} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:localhost} + port: ${env.PGVECTOR_PORT:5432} + db: ${env.PGVECTOR_DB:} + user: ${env.PGVECTOR_USER:} + password: ${env.PGVECTOR_PASSWORD:} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/registry.db +models: +- metadata: {} + model_id: openai/gpt-4o + provider_id: openai + provider_model_id: openai/gpt-4o + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: anthropic/claude-3-5-sonnet-latest + provider_id: anthropic + provider_model_id: anthropic/claude-3-5-sonnet-latest + model_type: llm +- metadata: {} + model_id: 
gemini/gemini-1.5-flash + provider_id: gemini + provider_model_id: gemini/gemini-1.5-flash + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: groq + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +shields: +- shield_id: meta-llama/Llama-Guard-3-8B +vector_dbs: [] +datasets: + - dataset_id: simpleqa + provider_id: huggingface + url: + uri: https://huggingface.co/datasets/llamastack/simpleqa + metadata: + path: llamastack/simpleqa + name: + split: train + dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + - dataset_id: mmlu_cot + provider_id: huggingface + url: + uri: https://huggingface.co/datasets/llamastack/mmlu_cot + metadata: + path: llamastack/mmlu_cot + name: all + split: test + dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + - dataset_id: gpqa_cot + provider_id: huggingface + url: + uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot + metadata: + path: llamastack/gpqa_0shot_cot + name: gpqa_main + split: train + dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string +scoring_fns: [] +benchmarks: + - benchmark_id: meta-reference-simpleqa + dataset_id: simpleqa + scoring_functions: ["llm-as-judge::405b-simpleqa"] + - benchmark_id: meta-reference-mmlu-cot + dataset_id: mmlu_cot + scoring_functions: ["basic::regex_parser_multiple_choice_answer"] + - benchmark_id: meta-reference-gpqa-cot + dataset_id: gpqa_cot + scoring_functions: ["basic::regex_parser_multiple_choice_answer"] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 From 649d9bc26d77ac1b8210b1bfc93841afae027ee1 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 7 Mar 2025 13:38:39 -0500 Subject: [PATCH 05/58] fix(security): Bump jinja2 to >=3.1.6 (#1461) This addresses the new vulnerability https://github.com/advisories/GHSA-cpwx-vrp4-4pq7. 
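As a quick sanity check that a given environment actually picked up the fixed
release, one can compare the installed version against the new floor. The
following is an illustrative snippet, not part of the patch; it assumes a
plain `X.Y.Z` version string:

```python
# Illustrative check: confirm the installed jinja2 satisfies the >=3.1.6 floor.
from importlib.metadata import version

installed = version("jinja2")
parts = tuple(int(x) for x in installed.split(".")[:3])
assert parts >= (3, 1, 6), f"jinja2 {installed} is still below 3.1.6"
```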
Signed-off-by: Yuan Tang
---
 pyproject.toml |  2 +-
 uv.lock        | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a58d01076..d8f3718d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,7 +79,7 @@ docs = [
   "sphinxcontrib.mermaid",
   "tomli",
 ]
-codegen = ["rich", "pydantic", "jinja2"]
+codegen = ["rich", "pydantic", "jinja2>=3.1.6"]

 [project.urls]
 Homepage = "https://github.com/meta-llama/llama-stack"
diff --git a/uv.lock b/uv.lock
index ec80d2430..e62d9426e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.10"
 resolution-markers = [
     "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -733,14 +734,14 @@ wheels = [

 [[package]]
 name = "jinja2"
-version = "3.1.5"
+version = "3.1.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "markupsafe" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674 }
+sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596 },
+    { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 },
 ]

 [[package]]
@@ -942,7 +943,7 @@ requires-dist = [
     { name = "groq", marker = "extra == 'test'" },
     { name = "httpx" },
     { name = "huggingface-hub" },
-    { name = "jinja2", marker = "extra == 'codegen'" },
+    { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" },
     { name = "jsonschema" },
     { name = "llama-stack-client", specifier = ">=0.1.4" },
     { name = "lm-format-enforcer", marker = "extra == 'test'", specifier = ">=0.10.9" },
@@ -985,6 +986,7 @@ requires-dist = [
     { name = "types-setuptools", marker = "extra == 'dev'" },
     { name = "uvicorn", marker = "extra == 'dev'" },
 ]
+provides-extras = ["dev", "test", "docs", "codegen"]

 [[package]]
 name = "llama-stack-client"

From c4b229f2c96510905b20b400b1e5333afe51417e Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Fri, 7 Mar 2025 13:38:55 -0500
Subject: [PATCH 06/58] chore: Delete unused .gitmodules (#1460)

This is no longer needed after
https://github.com/meta-llama/llama-stack/pull/1265.

Signed-off-by: Yuan Tang
---
 .gitmodules | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .gitmodules

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29bb..000000000

From 40cd48fa0917860ba0a2500efaf862cdc7861d73 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Sat, 8 Mar 2025 02:39:33 +0800
Subject: [PATCH 07/58] chore: remove the incorrect output (#1472)

# What does this PR do?

The client CLI output format changed in
https://github.com/meta-llama/llama-stack-client-python/blob/458e20702b5aa8f435ac5ce114fee9252b751d25/src/llama_stack_client/lib/cli/models/models.py#L52
so the example output shown in the docs is now incorrect. Per the earlier
discussion in
https://github.com/meta-llama/llama-stack/pull/1348#pullrequestreview-2654971315
there is no need to maintain the example output in the docs, so remove it.

Signed-off-by: reidliu
Co-authored-by: reidliu
---
 .../remote_hosted_distro/index.md             | 22 -------------------
 1 file changed, 22 deletions(-)

diff --git a/docs/source/distributions/remote_hosted_distro/index.md b/docs/source/distributions/remote_hosted_distro/index.md
index 2fbe381af..ef5a83d8a 100644
--- a/docs/source/distributions/remote_hosted_distro/index.md
+++ b/docs/source/distributions/remote_hosted_distro/index.md
@@ -17,26 +17,4 @@ $ llama-stack-client configure --endpoint https://llamastack-preview.fireworks.a
 $ llama-stack-client models list
 ```

-You will see outputs:
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier                   | llama_model                  | provider_id   | metadata   |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-1B-Instruct         | Llama3.2-1B-Instruct         | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-```
-
 Checkout the [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python/blob/main/docs/cli_reference.md) repo for more details on how to use the `llama-stack-client` CLI. Checkout [llama-stack-app](https://github.com/meta-llama/llama-stack-apps/tree/main) for examples applications built on top of Llama Stack.

From 511afe138150ddfcdc752d8e9f5884ac57c0e4e2 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka
Date: Fri, 7 Mar 2025 13:41:22 -0500
Subject: [PATCH 08/58] chore: add pytest-report.xml to gitignore (#1473)

# What does this PR do?

Ignores `pytest-report.xml`. The file is produced by the unit tests GitHub
workflow.

## Test Plan

Not needed.
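For context, a JUnit-style report like `pytest-report.xml` typically comes from
running pytest with the `--junitxml` flag; the exact invocation used by the
workflow may differ. A small illustrative snippet for inspecting such a report
locally:

```python
# Illustrative only: summarize a pytest-report.xml produced by something like
# `pytest --junitxml=pytest-report.xml`.
import xml.etree.ElementTree as ET

root = ET.parse("pytest-report.xml").getroot()
# Recent pytest wraps results in <testsuites>; older versions emit <testsuite> at the root.
suite = root.find("testsuite") if root.tag == "testsuites" else root
print({key: suite.get(key) for key in ("tests", "failures", "errors", "skipped")})
```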
Signed-off-by: Ihar Hrachyshka
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f54d1563d..163b65947 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ _build
 docs/src
 pyrightconfig.json
 venv/
+pytest-report.xml

From b8c519ba112077c9c749c5780bdb0509509ffeda Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Fri, 7 Mar 2025 10:41:50 -0800
Subject: [PATCH 09/58] feat: rag eval lifecycle notebook (#1458)

# What does this PR do?

- Add RAG eval lifecycle notebook
- Closes https://github.com/meta-llama/llama-stack/issues/1113
- Best reviewed in https://github.com/meta-llama/llama-stack/blob/rag_eval_notebook/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb

## Test Plan

Run the notebook.
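At a high level, the notebook exercises the agent and scoring APIs along the
following lines. This is an abbreviated sketch of the flow, not a substitute
for the notebook itself; the model and endpoint values are simply the ones the
notebook happens to use:

```python
# Abbreviated sketch of the notebook's eval loop (see the notebook for the full flow).
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent

client = LlamaStackClient(base_url="http://localhost:8321")
agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="You are a helpful assistant that can answer questions about the Torchtune project.",
)

# Ask a question and capture the generated answer.
session_id = agent.create_session(session_name="eval_session")
turn = agent.create_turn(
    messages=[{"role": "user", "content": "What does DoRA stand for in torchtune?"}],
    session_id=session_id,
    stream=False,
)

# Score the generated answer against a reference answer with an LLM-judged metric.
rows = [
    {
        "input_query": "What does DoRA stand for in torchtune?",
        "expected_answer": "Weight-Decomposed Low-Rank Adaptation",
        "generated_answer": turn.output_message.content,
    }
]
result = client.scoring.score(input_rows=rows, scoring_functions={"braintrust::factuality": None})
print(result.results["braintrust::factuality"].aggregated_results)
```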
---
 .../notebooks/Llama_Stack_RAG_Lifecycle.ipynb | 1427 +++++++++++++++++
 1 file changed, 1427 insertions(+)
 create mode 100644 docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb

diff --git a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
new file mode 100644
index 000000000..0d7b462cc
--- /dev/null
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
@@ -0,0 +1,1427 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Llama Stack RAG Lifecycle\n",
+    "\n",
+    "In this notebook, we will walk through the lifecycle of building and evaluating a RAG pipeline using Llama Stack. \n",
+    "\n",
+    "**Example: Torchtune Knowledge Agent** \n",
+    "\n",
+    "Throughout this notebook, we will build a knowledge agent that can answer questions about the Torchtune project. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Not in Google Colab environment\n"
+     ]
+    }
+   ],
+   "source": [
+    "from llama_stack_client import LlamaStackClient\n",
+    "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
+    "from llama_stack_client.types.agent_create_params import AgentConfig\n",
+    "from llama_stack_client.lib.agents.agent import Agent\n",
+    "from rich.pretty import pprint\n",
+    "import json\n",
+    "import uuid\n",
+    "from pydantic import BaseModel\n",
+    "import rich\n",
+    "import os\n",
+    "try:\n",
+    "    from google.colab import userdata\n",
+    "    os.environ['FIREWORKS_API_KEY'] = userdata.get('FIREWORKS_API_KEY')\n",
+    "except ImportError:\n",
+    "    print(\"Not in Google Colab environment\")\n",
+    "\n",
+    "# client = LlamaStackAsLibraryClient(\"fireworks\", provider_data = {\"fireworks_api_key\": os.environ['FIREWORKS_API_KEY']})\n",
+    "# _ = client.initialize()\n",
+    "\n",
+    "# Run against a hosted Llama Stack server\n",
+    "client = LlamaStackClient(base_url=\"http://localhost:8321\")\n",
+    "\n",
+    "MODEL_ID = \"meta-llama/Llama-3.3-70B-Instruct\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Simple Vanilla Agent\n",
+    "\n",
+    "First, we will build a simple vanilla agent without any access to an external knowledge base or tools, and check how it performs on a couple of questions. \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First, let's come up with a couple of examples to test the agent\n",
+    "examples = [\n",
+    "    {\n",
+    "        \"input_query\": \"What precision formats does torchtune support?\",\n",
+    "        \"expected_answer\": \"Torchtune supports two data types for precision: fp32 (full-precision) which uses 4 bytes per model and optimizer parameter, and bfloat16 (half-precision) which uses 2 bytes per model and optimizer parameter.\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input_query\": \"What does DoRA stand for in torchtune?\",\n",
+    "        \"expected_answer\": \"Weight-Decomposed Low-Rank Adaptation\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input_query\": \"How does the CPUOffloadOptimizer reduce GPU memory usage?\",\n",
+    "        \"expected_answer\": \"The CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It can also optionally offload gradients to CPU by using offload_gradients=True\"\n",
+    "    },\n",
+    "    {\n",
+    "        \"input_query\": \"How do I ensure only LoRA parameters are trainable when fine-tuning?\",\n",
+    "        \"expected_answer\": \"You can set only LoRA parameters to trainable using torchtune's utility functions: first fetch all LoRA parameters with lora_params = get_adapter_params(lora_model), then set them as trainable with set_trainable_params(lora_model, lora_params). The LoRA recipe handles this automatically.\"\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Question: What precision formats does torchtune support?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: Torchtune supports the following precision formats:\n",
+       "\n",
+       "* Full precision (FP32)\n",
+       "* Mixed precision (FP16)\n",
+       "\n",
+       "It may also support other formats such as INT8 and BF16 in the future, but currently, it primarily focuses on FP32 \n",
+       "and FP16. \n",
+       "\n",
+       "Please note that the specific precision formats supported by Torchtune may change over time, and it's always best \n",
+       "to check the official documentation for the most up-to-date information.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports the following precision formats:\n", + "\n", + "* Full precision \u001b[1m(\u001b[0mFP32\u001b[1m)\u001b[0m\n", + "* Mixed precision \u001b[1m(\u001b[0mFP16\u001b[1m)\u001b[0m\n", + "\n", + "It may also support other formats such as INT8 and BF16 in the future, but currently, it primarily focuses on FP32 \n", + "and FP16. \n", + "\n", + "Please note that the specific precision formats supported by Torchtune may change over time, and it's always best \n", + "to check the official documentation for the most up-to-date information.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: What does DoRA stand for in torchtune?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: In the context of the Torchtune project, DoRA stands for \"Decoupled Optimizer for Reparameterized \n",
+       "Architectures\".\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m In the context of the Torchtune project, DoRA stands for \u001b[32m\"Decoupled Optimizer for Reparameterized \u001b[0m\n", + "\u001b[32mArchitectures\"\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: The CPUOffloadOptimizer in the Torchtune project is designed to reduce GPU memory usage by offloading\n",
+       "certain computations from the GPU to the CPU. Here's how it works:\n",
+       "\n",
+       "1. **Identifying offloadable operations**: The optimizer analyzes the computation graph of the model and identifies\n",
+       "operations that can be offloaded from the GPU to the CPU. These operations are typically those that don't require \n",
+       "the massive parallel processing capabilities of the GPU, such as data preprocessing, encoding, or decoding.\n",
+       "2. **Offloading operations to CPU**: The optimizer offloads the identified operations to the CPU, which frees up \n",
+       "GPU memory and reduces the amount of data that needs to be transferred between the GPU and CPU.\n",
+       "3. **Minimizing data transfer**: The optimizer minimizes the amount of data that needs to be transferred between \n",
+       "the GPU and CPU by only transferring the necessary data for the offloaded operations. This reduces the overhead of \n",
+       "data transfer and helps to conserve GPU memory.\n",
+       "4. **Optimizing CPU-GPU synchronization**: The optimizer ensures that the CPU and GPU are properly synchronized, \n",
+       "which helps to prevent unnecessary memory allocations and deallocations on the GPU.\n",
+       "5. **Dynamic memory allocation**: The optimizer can dynamically allocate and deallocate memory on the GPU as \n",
+       "needed, which helps to reduce memory fragmentation and waste.\n",
+       "\n",
+       "By offloading computations to the CPU and minimizing data transfer, the CPUOffloadOptimizer can significantly \n",
+       "reduce GPU memory usage, which can lead to:\n",
+       "\n",
+       "* Improved model training and inference performance\n",
+       "* Increased batch sizes and throughput\n",
+       "* Reduced out-of-memory errors\n",
+       "* Better support for larger models and datasets\n",
+       "\n",
+       "Overall, the CPUOffloadOptimizer is a powerful tool for optimizing GPU memory usage in deep learning workloads, and\n",
+       "can help to improve the overall performance and efficiency of the Torchtune project.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer in the Torchtune project is designed to reduce GPU memory usage by offloading\n", + "certain computations from the GPU to the CPU. Here's how it works:\n", + "\n", + "\u001b[1;36m1\u001b[0m. **Identifying offloadable operations**: The optimizer analyzes the computation graph of the model and identifies\n", + "operations that can be offloaded from the GPU to the CPU. These operations are typically those that don't require \n", + "the massive parallel processing capabilities of the GPU, such as data preprocessing, encoding, or decoding.\n", + "\u001b[1;36m2\u001b[0m. **Offloading operations to CPU**: The optimizer offloads the identified operations to the CPU, which frees up \n", + "GPU memory and reduces the amount of data that needs to be transferred between the GPU and CPU.\n", + "\u001b[1;36m3\u001b[0m. **Minimizing data transfer**: The optimizer minimizes the amount of data that needs to be transferred between \n", + "the GPU and CPU by only transferring the necessary data for the offloaded operations. This reduces the overhead of \n", + "data transfer and helps to conserve GPU memory.\n", + "\u001b[1;36m4\u001b[0m. **Optimizing CPU-GPU synchronization**: The optimizer ensures that the CPU and GPU are properly synchronized, \n", + "which helps to prevent unnecessary memory allocations and deallocations on the GPU.\n", + "\u001b[1;36m5\u001b[0m. **Dynamic memory allocation**: The optimizer can dynamically allocate and deallocate memory on the GPU as \n", + "needed, which helps to reduce memory fragmentation and waste.\n", + "\n", + "By offloading computations to the CPU and minimizing data transfer, the CPUOffloadOptimizer can significantly \n", + "reduce GPU memory usage, which can lead to:\n", + "\n", + "* Improved model training and inference performance\n", + "* Increased batch sizes and throughput\n", + "* Reduced out-of-memory errors\n", + "* Better support for larger models and datasets\n", + "\n", + "Overall, the CPUOffloadOptimizer is a powerful tool for optimizing GPU memory usage in deep learning workloads, and\n", + "can help to improve the overall performance and efficiency of the Torchtune project.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: To ensure only LoRA (Low-Rank Adaptation) parameters are trainable when fine-tuning a model with \n",
+       "Torchtune, you can follow these steps:\n",
+       "\n",
+       "1. **Freeze the original model weights**: Before fine-tuning, you need to freeze the original model weights to \n",
+       "prevent them from being updated during the fine-tuning process. You can do this by setting the `requires_grad` \n",
+       "attribute of the model parameters to `False`. This will prevent the original model weights from being updated.\n",
+       "\n",
+       "2. **Create LoRA parameters**: Create LoRA parameters for the layers you want to fine-tune. LoRA parameters are \n",
+       "typically added to the original model weights to adapt the model to the new task.\n",
+       "\n",
+       "3. **Set LoRA parameters as trainable**: Set the LoRA parameters as trainable by setting their `requires_grad` \n",
+       "attribute to `True`. This will allow the LoRA parameters to be updated during the fine-tuning process.\n",
+       "\n",
+       "Here's a sample code snippet to illustrate this:\n",
+       "```python\n",
+       "import torch\n",
+       "import torch.nn as nn\n",
+       "\n",
+       "# Assume 'model' is your pre-trained model\n",
+       "model = ...\n",
+       "\n",
+       "# Freeze the original model weights\n",
+       "for param in model.parameters():\n",
+       "    param.requires_grad = False\n",
+       "\n",
+       "# Create LoRA parameters\n",
+       "lora_params = []\n",
+       "for name, module in model.named_modules():\n",
+       "    if isinstance(module, nn.Linear):  # or any other module you want to fine-tune\n",
+       "        lora_param = nn.Parameter(torch.randn(module.weight.shape))\n",
+       "        lora_params.append(lora_param)\n",
+       "        setattr(model, f\"{name}_lora\", lora_param)\n",
+       "\n",
+       "# Set LoRA parameters as trainable\n",
+       "for param in lora_params:\n",
+       "    param.requires_grad = True\n",
+       "\n",
+       "# Fine-tune the model with LoRA parameters\n",
+       "optimizer = torch.optim.Adam(lora_params, lr=1e-4)\n",
+       "```\n",
+       "By following these steps, you can ensure that only the LoRA parameters are trainable during fine-tuning, while \n",
+       "keeping the original model weights frozen.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m parameters are trainable when fine-tuning a model with \n", + "Torchtune, you can follow these steps:\n", + "\n", + "\u001b[1;36m1\u001b[0m. **Freeze the original model weights**: Before fine-tuning, you need to freeze the original model weights to \n", + "prevent them from being updated during the fine-tuning process. You can do this by setting the `requires_grad` \n", + "attribute of the model parameters to `\u001b[3;91mFalse\u001b[0m`. This will prevent the original model weights from being updated.\n", + "\n", + "\u001b[1;36m2\u001b[0m. **Create LoRA parameters**: Create LoRA parameters for the layers you want to fine-tune. LoRA parameters are \n", + "typically added to the original model weights to adapt the model to the new task.\n", + "\n", + "\u001b[1;36m3\u001b[0m. **Set LoRA parameters as trainable**: Set the LoRA parameters as trainable by setting their `requires_grad` \n", + "attribute to `\u001b[3;92mTrue\u001b[0m`. This will allow the LoRA parameters to be updated during the fine-tuning process.\n", + "\n", + "Here's a sample code snippet to illustrate this:\n", + "```python\n", + "import torch\n", + "import torch.nn as nn\n", + "\n", + "# Assume \u001b[32m'model'\u001b[0m is your pre-trained model\n", + "model = \u001b[33m...\u001b[0m\n", + "\n", + "# Freeze the original model weights\n", + "for param in \u001b[1;35mmodel.parameters\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m:\n", + " param.requires_grad = \u001b[3;91mFalse\u001b[0m\n", + "\n", + "# Create LoRA parameters\n", + "lora_params = \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "for name, module in \u001b[1;35mmodel.named_modules\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m:\n", + " if \u001b[1;35misinstance\u001b[0m\u001b[1m(\u001b[0mmodule, nn.Linear\u001b[1m)\u001b[0m: # or any other module you want to fine-tune\n", + " lora_param = \u001b[1;35mnn.Parameter\u001b[0m\u001b[1m(\u001b[0m\u001b[1;35mtorch.randn\u001b[0m\u001b[1m(\u001b[0mmodule.weight.shape\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", + " \u001b[1;35mlora_params.append\u001b[0m\u001b[1m(\u001b[0mlora_param\u001b[1m)\u001b[0m\n", + " \u001b[1;35msetattr\u001b[0m\u001b[1m(\u001b[0mmodel, f\"\u001b[1m{\u001b[0mname\u001b[1m}\u001b[0m_lora\", lora_param\u001b[1m)\u001b[0m\n", + "\n", + "# Set LoRA parameters as trainable\n", + "for param in lora_params:\n", + " param.requires_grad = \u001b[3;92mTrue\u001b[0m\n", + "\n", + "# Fine-tune the model with LoRA parameters\n", + "optimizer = \u001b[1;35mtorch.optim.Adam\u001b[0m\u001b[1m(\u001b[0mlora_params, \u001b[33mlr\u001b[0m=\u001b[1;36m1e\u001b[0m\u001b[1;36m-4\u001b[0m\u001b[1m)\u001b[0m\n", + "```\n", + "By following these steps, you can ensure that only the LoRA parameters are trainable during fine-tuning, while \n", + "keeping the original model weights frozen.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "simple_agent = Agent(client,\n", + " model=MODEL_ID, \n", + " instructions=\"You are a helpful assistant that can answer questions about the Torchtune project.\")\n", + "for example in examples:\n", + " simple_session_id = simple_agent.create_session(session_name=f\"simple_session_{uuid.uuid4()}\")\n", + " response = simple_agent.create_turn(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": example[\"input_query\"]\n", + " }\n", + " ],\n", + " session_id=simple_session_id,\n", + " 
stream=False\n", + " )\n", + " rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n", + " rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.1 Evaluate Agent Responses\n", + "Let's gather up the agent's logs and evaluate the agent's performance. We can see that our agent's response is quite bad and off from the expected answer." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ScoringScoreResponse(\n",
+       "results={\n",
+       "│   │   'braintrust::factuality': ScoringResult(\n",
+       "│   │   │   aggregated_results={'average': {'average': 0.3}},\n",
+       "│   │   │   score_rows=[\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.0,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'D',\n",
+       "│   │   │   │   │   │   'rationale': '1. **Expert Answer**: The expert states that Torchtune supports two precision formats: fp32 (full-precision) and bfloat16 (half-precision).\\n\\n2. **Submitted Answer**: The submission mentions that Torchtune supports full precision (FP32) and mixed precision (FP16). It also speculates about potential future support for other formats like INT8 and BF16, but emphasizes the current focus on FP32 and FP16.\\n\\n3. **Comparison**:\\n   - Both answers agree on the support for FP32.\\n   - The expert mentions bfloat16 (BF16), while the submission mentions FP16 and speculates about BF16 in the future. This is a key difference as the expert confirms BF16 support, whereas the submission does not.\\n   - The submission introduces FP16, which is not mentioned by the expert.\\n   - The submission also speculates about future support for INT8 and BF16, which is not addressed by the expert.\\n\\n4. **Conclusion**: There is a disagreement between the submitted answer and the expert answer regarding the precision formats supported by Torchtune. The expert confirms BF16 support, while the submission does not, and instead mentions FP16, which the expert does not confirm. Therefore, the correct choice is (D).'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.0,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'D',\n",
+       "│   │   │   │   │   │   'rationale': '1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation\".\\n2. The submitted answer states that DoRA stands for \"Decoupled Optimizer for Reparameterized Architectures\".\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in the context of torchtune.\\n5. Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': '1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU.\\n2. The submitted answer describes a broader mechanism of offloading computations from the GPU to the CPU, including identifying offloadable operations, minimizing data transfer, optimizing CPU-GPU synchronization, and dynamic memory allocation.\\n3. The submitted answer does not explicitly mention keeping optimizer states on the CPU or performing optimizer steps on the CPU, which are key points in the expert answer.\\n4. The submitted answer provides additional details about the process of offloading operations and its benefits, which are not mentioned in the expert answer.\\n5. The submitted answer does not conflict with the expert answer but rather expands on the concept of offloading to the CPU with additional mechanisms and benefits.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it, as it includes all the information from the expert answer and adds more details.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': \"1. **Expert Answer Summary**: The expert answer provides a concise method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Summary**: The submitted answer provides a more detailed explanation, including steps to freeze the original model weights, create LoRA parameters, and set them as trainable. It includes a code snippet demonstrating these steps, using PyTorch to manually set `requires_grad` attributes.\\n\\n3. **Comparison**:\\n   - Both answers aim to ensure only LoRA parameters are trainable.\\n   - The expert answer uses torchtune's utility functions, while the submitted answer provides a manual method using PyTorch.\\n   - The submitted answer includes additional steps and a code example, which are not present in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer. It includes all the information from the expert answer (ensuring only LoRA parameters are trainable) and adds more detail on how to achieve this manually. There is no conflict between the two answers, as they both achieve the same goal using different methods.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.\"\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ]\n",
+       "│   │   )\n",
+       "}\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.3\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. **Expert Answer**: The expert states that Torchtune supports two precision formats: fp32 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and bfloat16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n2. **Submitted Answer**: The submission mentions that Torchtune supports full precision \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFP32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and mixed precision \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFP16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. It also speculates about potential future support for other formats like INT8 and BF16, but emphasizes the current focus on FP32 and FP16.\\n\\n3. **Comparison**:\\n - Both answers agree on the support for FP32.\\n - The expert mentions bfloat16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mBF16\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, while the submission mentions FP16 and speculates about BF16 in the future. This is a key difference as the expert confirms BF16 support, whereas the submission does not.\\n - The submission introduces FP16, which is not mentioned by the expert.\\n - The submission also speculates about future support for INT8 and BF16, which is not addressed by the expert.\\n\\n4. **Conclusion**: There is a disagreement between the submitted answer and the expert answer regarding the precision formats supported by Torchtune. The expert confirms BF16 support, while the submission does not, and instead mentions FP16, which the expert does not confirm. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation\".\\n2. The submitted answer states that DoRA stands for \"Decoupled Optimizer for Reparameterized Architectures\".\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. 
Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in the context of torchtune.\\n5. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU.\\n2. The submitted answer describes a broader mechanism of offloading computations from the GPU to the CPU, including identifying offloadable operations, minimizing data transfer, optimizing CPU-GPU synchronization, and dynamic memory allocation.\\n3. The submitted answer does not explicitly mention keeping optimizer states on the CPU or performing optimizer steps on the CPU, which are key points in the expert answer.\\n4. The submitted answer provides additional details about the process of offloading operations and its benefits, which are not mentioned in the expert answer.\\n5. The submitted answer does not conflict with the expert answer but rather expands on the concept of offloading to the CPU with additional mechanisms and benefits.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it, as it includes all the information from the expert answer and adds more details.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Summary**: The expert answer provides a concise method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Summary**: The submitted answer provides a more detailed explanation, including steps to freeze the original model weights, create LoRA parameters, and set them as trainable. It includes a code snippet demonstrating these steps, using PyTorch to manually set `requires_grad` attributes.\\n\\n3. 
**Comparison**:\\n - Both answers aim to ensure only LoRA parameters are trainable.\\n - The expert answer uses torchtune's utility functions, while the submitted answer provides a manual method using PyTorch.\\n - The submitted answer includes additional steps and a code example, which are not present in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer. It includes all the information from the expert answer \u001b[0m\u001b[32m(\u001b[0m\u001b[32mensuring only LoRA parameters are trainable\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and adds more detail on how to achieve this manually. There is no conflict between the two answers, as they both achieve the same goal using different methods.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "eval_rows = []\n", + "for i, session_id in enumerate(simple_agent.sessions):\n", + " session_response = client.agents.session.retrieve(agent_id=simple_agent.agent_id, session_id=session_id)\n", + " for turn in session_response.turns:\n", + " eval_rows.append({\n", + " \"input_query\": examples[i][\"input_query\"],\n", + " \"expected_answer\": examples[i][\"expected_answer\"],\n", + " \"generated_answer\": turn.output_message.content,\n", + " })\n", + "\n", + "scoring_params = {\n", + " \"braintrust::factuality\": None,\n", + "}\n", + "scoring_response = client.scoring.score(\n", + " input_rows=eval_rows,\n", + " scoring_functions=scoring_params,\n", + ")\n", + "pprint(scoring_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Search Agent\n", + "\n", + "Now, let's see how we can improve the agent's performance by adding a search tool." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Question: What precision formats does torchtune support?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: Torchtune supports the following precision formats:\n",
+       "\n",
+       "* bf16 (16-bit floating-point format)\n",
+       "* fp32 (32-bit floating-point format, also known as \"full-precision\")\n",
+       "\n",
+       "It's worth noting that torchtune also provides support for mixed-precision techniques, which allow for the use of \n",
+       "different precision formats for different parts of the model or during different stages of training.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports the following precision formats:\n", + "\n", + "* bf16 \u001b[1m(\u001b[0m\u001b[1;36m16\u001b[0m-bit floating-point format\u001b[1m)\u001b[0m\n", + "* fp32 \u001b[1m(\u001b[0m\u001b[1;36m32\u001b[0m-bit floating-point format, also known as \u001b[32m\"full-precision\"\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + "It's worth noting that torchtune also provides support for mixed-precision techniques, which allow for the use of \n", + "different precision formats for different parts of the model or during different stages of training.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: What does DoRA stand for in torchtune?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: DoRA stands for \"Decoupled Orthogonal Random Adaptation\" in torchtune.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for \u001b[32m\"Decoupled Orthogonal Random Adaptation\"\u001b[0m in torchtune.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: The CPUOffloadOptimizer reduces GPU memory usage by offloading gradients and trainable parameters to \n",
+       "the CPU, allowing for more efficient use of GPU memory. This can be achieved by setting `offload_gradients=True` in\n",
+       "the CPUOffloadOptimizer, which frees gradients once device-to-host transfer finishes. Additionally, using paged \n",
+       "Adam with `optimizer_in_bwd=True` can also help reduce memory usage. However, it's important to note that the \n",
+       "actual memory usage may vary depending on the specific use case and model architecture.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading gradients and trainable parameters to \n", + "the CPU, allowing for more efficient use of GPU memory. This can be achieved by setting `\u001b[33moffload_gradients\u001b[0m=\u001b[3;92mTrue\u001b[0m` in\n", + "the CPUOffloadOptimizer, which frees gradients once device-to-host transfer finishes. Additionally, using paged \n", + "Adam with `\u001b[33moptimizer_in_bwd\u001b[0m=\u001b[3;92mTrue\u001b[0m` can also help reduce memory usage. However, it's important to note that the \n", + "actual memory usage may vary depending on the specific use case and model architecture.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n",
+       "function from the `torchtune.modules.peft.peft_utils` module. This function allows you to specify which parameters \n",
+       "to make trainable, and you can use it to set only the LoRA parameters as trainable.\n",
+       "\n",
+       "Here is an example of how to do this:\n",
+       "```\n",
+       "import torch\n",
+       "from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n",
+       "from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n",
+       "\n",
+       "# Load the model and adapter\n",
+       "model = llama2_7b()\n",
+       "adapter = lora_llama2_7b()\n",
+       "\n",
+       "# Get the adapter parameters\n",
+       "adapter_params = get_adapter_params(adapter)\n",
+       "\n",
+       "# Set only the adapter parameters as trainable\n",
+       "set_trainable_params(model, adapter_params)\n",
+       "```\n",
+       "This code loads the LLaMA-2 model and the LoRA adapter, gets the adapter parameters, and then sets only those \n",
+       "parameters as trainable using the `set_trainable_params` function. This ensures that only the LoRA parameters are \n",
+       "updated during fine-tuning, while the rest of the model remains frozen.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n", + "function from the `torchtune.modules.peft.peft_utils` module. This function allows you to specify which parameters \n", + "to make trainable, and you can use it to set only the LoRA parameters as trainable.\n", + "\n", + "Here is an example of how to do this:\n", + "```\n", + "import torch\n", + "from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n", + "from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n", + "\n", + "# Load the model and adapter\n", + "model = \u001b[1;35mllama2_7b\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + "adapter = \u001b[1;35mlora_llama2_7b\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\n", + "\n", + "# Get the adapter parameters\n", + "adapter_params = \u001b[1;35mget_adapter_params\u001b[0m\u001b[1m(\u001b[0madapter\u001b[1m)\u001b[0m\n", + "\n", + "# Set only the adapter parameters as trainable\n", + "\u001b[1;35mset_trainable_params\u001b[0m\u001b[1m(\u001b[0mmodel, adapter_params\u001b[1m)\u001b[0m\n", + "```\n", + "This code loads the LLaMA-\u001b[1;36m2\u001b[0m model and the LoRA adapter, gets the adapter parameters, and then sets only those \n", + "parameters as trainable using the `set_trainable_params` function. This ensures that only the LoRA parameters are \n", + "updated during fine-tuning, while the rest of the model remains frozen.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "search_agent = Agent(client, \n", + " model=MODEL_ID,\n", + " instructions=\"You are a helpful assistant that can answer questions about the Torchtune project. You should always use the search tool to answer questions.\",\n", + " tools=[\"builtin::websearch\"])\n", + "for example in examples:\n", + " search_session_id = search_agent.create_session(session_name=f\"search_session_{uuid.uuid4()}\")\n", + " response = search_agent.create_turn(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": example[\"input_query\"]\n", + " }\n", + " ],\n", + " session_id=search_session_id,\n", + " stream=False\n", + " )\n", + " rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n", + " rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1 Evaluate Agent Responses\n", + "\n", + "We can see that with a search tool, the agent's performance is much better, and have less hallucinations. " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ScoringScoreResponse(\n",
+       "results={\n",
+       "│   │   'braintrust::factuality': ScoringResult(\n",
+       "│   │   │   aggregated_results={'average': {'average': 0.44999999999999996}},\n",
+       "│   │   │   score_rows=[\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': '1. **Expert Answer Details**: The expert answer states that Torchtune supports two precision formats: fp32 (full-precision) and bfloat16 (half-precision).\\n\\n2. **Submitted Answer Details**: The submitted answer mentions two precision formats: bf16 (16-bit floating-point format) and fp32 (32-bit floating-point format, also known as \"full-precision\"). It also adds that Torchtune supports mixed-precision techniques.\\n\\n3. **Comparison of Precision Formats**:\\n   - The expert answer uses \"bfloat16\" while the submitted answer uses \"bf16\". These are equivalent terms, as \"bf16\" is a common abbreviation for \"bfloat16\".\\n   - Both answers mention \"fp32\" as a supported precision format.\\n\\n4. **Additional Information in Submission**: The submitted answer includes additional information about mixed-precision techniques, which is not mentioned in the expert answer.\\n\\n5. **Consistency Check**: The submitted answer includes all the information from the expert answer and adds more details about mixed-precision techniques. There is no conflict between the two answers.\\n\\nBased on the above analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.0,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'D',\n",
+       "│   │   │   │   │   │   'rationale': '1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Orthogonal Random Adaptation.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in torchtune.\\n5. Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': '1. **Expert Answer Analysis**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU by setting `offload_gradients=True`.\\n\\n2. **Submitted Answer Analysis**: The submitted answer mentions offloading gradients and trainable parameters to the CPU, which allows for more efficient use of GPU memory. It specifies the use of `offload_gradients=True` to free gradients after device-to-host transfer. Additionally, it introduces the concept of using paged Adam with `optimizer_in_bwd=True` to help reduce memory usage. It also notes that actual memory usage may vary depending on the use case and model architecture.\\n\\n3. **Comparison**:\\n   - Both answers mention offloading gradients to the CPU using `offload_gradients=True`.\\n   - The expert answer focuses on keeping optimizer states and performing optimizer steps on the CPU, while the submitted answer expands on this by mentioning trainable parameters and the use of paged Adam.\\n   - The submitted answer provides additional context about memory usage variability and the use of paged Adam, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it includes all the information from the expert answer and adds more details about trainable parameters, paged Adam, and memory usage variability. There is no conflict between the two answers, and the additional information in the submitted answer is consistent with the expert answer.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': \"1. **Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer provides a detailed example of how to ensure only LoRA parameters are trainable. It uses the `set_trainable_params` function from `torchtune.modules.peft.peft_utils` and provides a code example that includes loading a model and adapter, fetching adapter parameters, and setting them as trainable.\\n\\n3. **Comparison**:\\n   - Both answers mention the use of `set_trainable_params` to set LoRA parameters as trainable.\\n   - Both answers involve fetching LoRA parameters using a function (`get_adapter_params`).\\n   - The submitted answer provides additional context by including a code example and specifying the module path for the functions used.\\n   - The expert answer mentions that the LoRA recipe handles this automatically, which is not explicitly stated in the submitted answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer. It includes all the information from the expert answer and adds more detail, such as a code example and specific module paths. There is no conflict between the two answers, and the additional information in the submitted answer is consistent with the expert answer.\"\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ]\n",
+       "│   │   )\n",
+       "}\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.44999999999999996\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. **Expert Answer Details**: The expert answer states that Torchtune supports two precision formats: fp32 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and bfloat16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n2. **Submitted Answer Details**: The submitted answer mentions two precision formats: bf16 \u001b[0m\u001b[32m(\u001b[0m\u001b[32m16-bit floating-point format\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and fp32 \u001b[0m\u001b[32m(\u001b[0m\u001b[32m32-bit floating-point format, also known as \"full-precision\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. It also adds that Torchtune supports mixed-precision techniques.\\n\\n3. **Comparison of Precision Formats**:\\n - The expert answer uses \"bfloat16\" while the submitted answer uses \"bf16\". These are equivalent terms, as \"bf16\" is a common abbreviation for \"bfloat16\".\\n - Both answers mention \"fp32\" as a supported precision format.\\n\\n4. **Additional Information in Submission**: The submitted answer includes additional information about mixed-precision techniques, which is not mentioned in the expert answer.\\n\\n5. **Consistency Check**: The submitted answer includes all the information from the expert answer and adds more details about mixed-precision techniques. There is no conflict between the two answers.\\n\\nBased on the above analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Orthogonal Random Adaptation.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer regarding what DoRA stands for in torchtune.\\n5. 
Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. **Expert Answer Analysis**: The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on the CPU and performing optimizer steps on the CPU. It also mentions the optional offloading of gradients to the CPU by setting `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n\\n2. **Submitted Answer Analysis**: The submitted answer mentions offloading gradients and trainable parameters to the CPU, which allows for more efficient use of GPU memory. It specifies the use of `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m` to free gradients after device-to-host transfer. Additionally, it introduces the concept of using paged Adam with `\u001b[0m\u001b[32moptimizer_in_bwd\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m` to help reduce memory usage. It also notes that actual memory usage may vary depending on the use case and model architecture.\\n\\n3. **Comparison**:\\n - Both answers mention offloading gradients to the CPU using `\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`.\\n - The expert answer focuses on keeping optimizer states and performing optimizer steps on the CPU, while the submitted answer expands on this by mentioning trainable parameters and the use of paged Adam.\\n - The submitted answer provides additional context about memory usage variability and the use of paged Adam, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it includes all the information from the expert answer and adds more details about trainable parameters, paged Adam, and memory usage variability. There is no conflict between the two answers, and the additional information in the submitted answer is consistent with the expert answer.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. 
**Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer provides a detailed example of how to ensure only LoRA parameters are trainable. It uses the `set_trainable_params` function from `torchtune.modules.peft.peft_utils` and provides a code example that includes loading a model and adapter, fetching adapter parameters, and setting them as trainable.\\n\\n3. **Comparison**:\\n - Both answers mention the use of `set_trainable_params` to set LoRA parameters as trainable.\\n - Both answers involve fetching LoRA parameters using a function \u001b[0m\u001b[32m(\u001b[0m\u001b[32m`get_adapter_params`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n - The submitted answer provides additional context by including a code example and specifying the module path for the functions used.\\n - The expert answer mentions that the LoRA recipe handles this automatically, which is not explicitly stated in the submitted answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer. It includes all the information from the expert answer and adds more detail, such as a code example and specific module paths. There is no conflict between the two answers, and the additional information in the submitted answer is consistent with the expert answer.\"\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "eval_rows = []\n", + "for i, session_id in enumerate(search_agent.sessions):\n", + " session_response = client.agents.session.retrieve(agent_id=search_agent.agent_id, session_id=session_id)\n", + " for turn in session_response.turns:\n", + " eval_rows.append({\n", + " \"input_query\": examples[i][\"input_query\"],\n", + " \"expected_answer\": examples[i][\"expected_answer\"],\n", + " \"generated_answer\": turn.output_message.content,\n", + " })\n", + "\n", + "scoring_params = {\n", + " \"braintrust::factuality\": None,\n", + "}\n", + "scoring_response = client.scoring.score(\n", + " input_rows=eval_rows,\n", + " scoring_functions=scoring_params,\n", + ")\n", + "pprint(scoring_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. RAG Agent\n", + "\n", + "Now, let's see how we can improve the agent's performance by adding a RAG tool that explicitly retrieves information from Torchtune's documentation. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.types import Document\n", + "urls = [\n", + " \"memory_optimizations.rst\",\n", + " \"chat.rst\",\n", + " \"llama3.rst\",\n", + " \"datasets.rst\",\n", + " \"qat_finetune.rst\",\n", + " \"lora_finetune.rst\",\n", + "]\n", + "documents = [\n", + " Document(\n", + " document_id=f\"num-{i}\",\n", + " content=f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n", + " mime_type=\"text/plain\",\n", + " metadata={},\n", + " )\n", + " for i, url in enumerate(urls)\n", + "]\n", + "\n", + "vector_providers = [\n", + " provider for provider in client.providers.list() if provider.api == \"vector_io\"\n", + "]\n", + "selected_vector_provider = vector_providers[0]\n", + "vector_db_id = f\"test_vector_db_{uuid.uuid4()}\"\n", + "client.vector_dbs.register(\n", + " vector_db_id=vector_db_id,\n", + " embedding_model=\"all-MiniLM-L6-v2\",\n", + " embedding_dimension=384,\n", + " provider_id=selected_vector_provider.provider_id,\n", + ")\n", + "\n", + "client.tool_runtime.rag_tool.insert(\n", + " documents=documents,\n", + " vector_db_id=vector_db_id,\n", + " chunk_size_in_tokens=512,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Question: What precision formats does torchtune support?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: Torchtune supports the following precision formats:\n",
+       "\n",
+       "* bfloat16 (half-precision)\n",
+       "* fp32 (full-precision)\n",
+       "* int8 (integer 8-bit)\n",
+       "* int4 (integer 4-bit)\n",
+       "\n",
+       "Note that mixed-precision training is not currently supported in torchtune.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports the following precision formats:\n", + "\n", + "* bfloat16 \u001b[1m(\u001b[0mhalf-precision\u001b[1m)\u001b[0m\n", + "* fp32 \u001b[1m(\u001b[0mfull-precision\u001b[1m)\u001b[0m\n", + "* int8 \u001b[1m(\u001b[0minteger \u001b[1;36m8\u001b[0m-bit\u001b[1m)\u001b[0m\n", + "* int4 \u001b[1m(\u001b[0minteger \u001b[1;36m4\u001b[0m-bit\u001b[1m)\u001b[0m\n", + "\n", + "Note that mixed-precision training is not currently supported in torchtune.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: What does DoRA stand for in torchtune?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: DoRA stands for \"Decoupled Orthogonal Random Axes\" in the context of the Torchtune project.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for \u001b[32m\"Decoupled Orthogonal Random Axes\"\u001b[0m in the context of the Torchtune project.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to CPU,\n",
+       "thus reducing the memory usage on the GPU. This is especially useful when training large models or when using \n",
+       "stateful optimizers, as it can significantly reduce the memory requirements. However, it may come at the cost of \n",
+       "increased CPU RAM usage and potentially slower training speeds.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to CPU,\n", + "thus reducing the memory usage on the GPU. This is especially useful when training large models or when using \n", + "stateful optimizers, as it can significantly reduce the memory requirements. However, it may come at the cost of \n", + "increased CPU RAM usage and potentially slower training speeds.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Question: How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Agent Answer: To ensure only LoRA parameters are trainable when fine-tuning, you can use the `get_adapter_params` \n",
+       "and `set_trainable_params` functions from `torchtune.modules.peft.peft_utils`. \n",
+       "\n",
+       "Here is how to do it:\n",
+       "\n",
+       "```python\n",
+       "from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n",
+       "\n",
+       "# Fetch all params from the model that are associated with LoRA.\n",
+       "lora_params = get_adapter_params(lora_model)\n",
+       "\n",
+       "# Set requires_grad=True on lora_params, and requires_grad=False on all others.\n",
+       "set_trainable_params(lora_model, lora_params)\n",
+       "```\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA parameters are trainable when fine-tuning, you can use the `get_adapter_params` \n", + "and `set_trainable_params` functions from `torchtune.modules.peft.peft_utils`. \n", + "\n", + "Here is how to do it:\n", + "\n", + "```python\n", + "from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n", + "\n", + "# Fetch all params from the model that are associated with LoRA.\n", + "lora_params = \u001b[1;35mget_adapter_params\u001b[0m\u001b[1m(\u001b[0mlora_model\u001b[1m)\u001b[0m\n", + "\n", + "# Set \u001b[33mrequires_grad\u001b[0m=\u001b[3;92mTrue\u001b[0m on lora_params, and \u001b[33mrequires_grad\u001b[0m=\u001b[3;91mFalse\u001b[0m on all others.\n", + "\u001b[1;35mset_trainable_params\u001b[0m\u001b[1m(\u001b[0mlora_model, lora_params\u001b[1m)\u001b[0m\n", + "```\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "rag_agent = Agent(\n", + " client,\n", + " model=MODEL_ID,\n", + " instructions=\"You are a helpful assistant that can answer questions about the Torchtune project. You should always use the RAG tool to answer questions.\",\n", + " tools=[{\n", + " \"name\": \"builtin::rag\",\n", + " \"args\": {\"vector_db_ids\": [vector_db_id]},\n", + " }],\n", + ")\n", + "\n", + "for example in examples:\n", + " rag_session_id = rag_agent.create_session(session_name=f\"rag_session_{uuid.uuid4()}\")\n", + " response = rag_agent.create_turn(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": example[\"input_query\"]\n", + " }\n", + " ],\n", + " session_id=rag_session_id,\n", + " stream=False\n", + " )\n", + " rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n", + " rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
ScoringScoreResponse(\n",
+       "results={\n",
+       "│   │   'braintrust::factuality': ScoringResult(\n",
+       "│   │   │   aggregated_results={'average': {'average': 0.3}},\n",
+       "│   │   │   score_rows=[\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.0,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'D',\n",
+       "│   │   │   │   │   │   'rationale': '1. The expert answer states that Torchtune supports two precision formats: fp32 and bfloat16.\\n2. The submitted answer lists four precision formats: bfloat16, fp32, int8, and int4.\\n3. The submitted answer includes the two formats mentioned by the expert (bfloat16 and fp32), but also adds int8 and int4, which are not mentioned by the expert.\\n4. The submitted answer also states that mixed-precision training is not supported, which is not addressed in the expert answer.\\n5. Since the submitted answer includes additional precision formats (int8 and int4) that are not mentioned by the expert, there is a factual disagreement between the two answers regarding the supported precision formats.\\n6. Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.0,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'D',\n",
+       "│   │   │   │   │   │   'rationale': '1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Orthogonal Random Axes.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer.\\n5. Therefore, the correct choice is (D) There is a disagreement between the submitted answer and the expert answer.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': '1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU using offload_gradients=True.\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to CPU. It also mentions that this is useful for large models or stateful optimizers and notes potential downsides like increased CPU RAM usage and slower training speeds.\\n3. The submitted answer includes all the points mentioned in the expert answer: offloading optimizer states and optionally gradients to CPU.\\n4. Additionally, the submitted answer provides extra context about the usefulness for large models and potential downsides, which are not mentioned in the expert answer.\\n5. There is no factual disagreement between the two answers; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   },\n",
+       "│   │   │   │   {\n",
+       "│   │   │   │   │   'score': 0.6,\n",
+       "│   │   │   │   │   'metadata': {\n",
+       "│   │   │   │   │   │   'choice': 'B',\n",
+       "│   │   │   │   │   │   'rationale': \"1. **Identify the core content of both answers:**\\n   - The expert answer explains how to set only LoRA parameters as trainable using torchtune's utility functions by fetching all LoRA parameters with `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also mentions that the LoRA recipe handles this automatically.\\n   - The submitted answer provides a similar explanation, detailing the use of `get_adapter_params` and `set_trainable_params` from `torchtune.modules.peft.peft_utils` to ensure only LoRA parameters are trainable. It includes a code snippet demonstrating the process.\\n\\n2. **Compare the factual content:**\\n   - Both answers describe the same process of fetching LoRA parameters and setting them as trainable using the same functions.\\n   - The submitted answer includes additional details such as the import statement and a code snippet, which are not present in the expert answer.\\n   - The expert answer mentions that the LoRA recipe handles this automatically, which is not mentioned in the submission.\\n\\n3. **Determine the relationship between the answers:**\\n   - The submitted answer is a superset of the expert answer because it includes all the information provided by the expert and adds more details, such as the import statement and code snippet.\\n   - There is no conflict between the two answers; the submission expands on the expert's explanation.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.\"\n",
+       "│   │   │   │   │   }\n",
+       "│   │   │   │   }\n",
+       "│   │   │   ]\n",
+       "│   │   )\n",
+       "}\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.3\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that Torchtune supports two precision formats: fp32 and bfloat16.\\n2. The submitted answer lists four precision formats: bfloat16, fp32, int8, and int4.\\n3. The submitted answer includes the two formats mentioned by the expert \u001b[0m\u001b[32m(\u001b[0m\u001b[32mbfloat16 and fp32\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, but also adds int8 and int4, which are not mentioned by the expert.\\n4. The submitted answer also states that mixed-precision training is not supported, which is not addressed in the expert answer.\\n5. Since the submitted answer includes additional precision formats \u001b[0m\u001b[32m(\u001b[0m\u001b[32mint8 and int4\u001b[0m\u001b[32m)\u001b[0m\u001b[32m that are not mentioned by the expert, there is a factual disagreement between the two answers regarding the supported precision formats.\\n6. Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.0\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'D'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer states that DoRA stands for \"Decoupled Orthogonal Random Axes.\"\\n3. The two answers provide completely different expansions for the acronym DoRA.\\n4. Since the expansions are different, there is a clear disagreement between the submitted answer and the expert answer.\\n5. 
Therefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mD\u001b[0m\u001b[32m)\u001b[0m\u001b[32m There is a disagreement between the submitted answer and the expert answer.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU using \u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m.\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to CPU. It also mentions that this is useful for large models or stateful optimizers and notes potential downsides like increased CPU RAM usage and slower training speeds.\\n3. The submitted answer includes all the points mentioned in the expert answer: offloading optimizer states and optionally gradients to CPU.\\n4. Additionally, the submitted answer provides extra context about the usefulness for large models and potential downsides, which are not mentioned in the expert answer.\\n5. There is no factual disagreement between the two answers; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Identify the core content of both answers:**\\n - The expert answer explains how to set only LoRA parameters as trainable using torchtune's utility functions by fetching all LoRA parameters with `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also mentions that the LoRA recipe handles this automatically.\\n - The submitted answer provides a similar explanation, detailing the use of `get_adapter_params` and `set_trainable_params` from `torchtune.modules.peft.peft_utils` to ensure only LoRA parameters are trainable. It includes a code snippet demonstrating the process.\\n\\n2. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Deep dive into RAG Tool Performance\n",
+    "- Now, let's take a closer look at how the RAG tool is doing, specifically on the second example, where the agent fails to identify what DoRA stands for.\n",
+    "- Notice that the issue lies with the retrieval step: the retrieved documents are not relevant to the question. A quick way to inspect the retrieval directly is sketched in the next cell."
+   ]
+  },
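+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following cell is a minimal sketch for inspecting what the retriever returns for the failing question. It assumes the `client.tool_runtime.rag_tool.query` endpoint, which may differ across llama-stack-client versions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: query the vector DB through the RAG tool directly, bypassing the agent,\n",
+    "# to see which chunks come back for the question the agent answered incorrectly.\n",
+    "# Assumption: rag_tool.query exists with this signature in your client version.\n",
+    "retrieved = client.tool_runtime.rag_tool.query(\n",
+    "    vector_db_ids=[vector_db_id],\n",
+    "    content=\"What does DoRA stand for in torchtune?\",\n",
+    ")\n",
+    "for item in retrieved.content:\n",
+    "    # Print a short prefix of each retrieved chunk to judge its relevance.\n",
+    "    print(item.text[:200])"
+   ]
+  },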
[\n",
+       "Turn(\n",
+       "│   │   input_messages=[UserMessage(content='What does DoRA stand for in torchtune?', role='user', context=None)],\n",
+       "│   │   output_message=CompletionMessage(\n",
+       "│   │   │   content='DoRA stands for \"Decoupled Orthogonal Random Axes\" in the context of the Torchtune project.',\n",
+       "│   │   │   role='assistant',\n",
+       "│   │   │   stop_reason='end_of_turn',\n",
+       "│   │   │   tool_calls=[]\n",
+       "│   │   ),\n",
+       "│   │   session_id='b5b5b9c5-1f14-404a-9677-cdb413b9f328',\n",
+       "│   │   started_at=datetime.datetime(2025, 3, 7, 10, 35, 24, 235903, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=57600))),\n",
+       "│   │   steps=[\n",
+       "│   │   │   InferenceStep(\n",
+       "│   │   │   │   api_model_response=CompletionMessage(\n",
+       "│   │   │   │   │   content='',\n",
+       "│   │   │   │   │   role='assistant',\n",
+       "│   │   │   │   │   stop_reason='end_of_turn',\n",
+       "│   │   │   │   │   tool_calls=[\n",
+       "│   │   │   │   │   │   ToolCall(\n",
+       "│   │   │   │   │   │   │   arguments={'query': 'DoRA meaning in Torchtune'},\n",
+       "│   │   │   │   │   │   │   call_id='c2c088b9-cf2f-41b5-a050-dd5743112f48',\n",
+       "│   │   │   │   │   │   │   tool_name='knowledge_search'\n",
+       "│   │   │   │   │   │   )\n",
+       "│   │   │   │   │   ]\n",
+       "│   │   │   │   ),\n",
+       "│   │   │   │   step_id='27ba55cd-0252-4cff-8141-129b3b8dd021',\n",
+       "│   │   │   │   step_type='inference',\n",
+       "│   │   │   │   turn_id='bb111412-e2e9-40ca-9cd2-87df200807ab',\n",
+       "│   │   │   │   completed_at=datetime.datetime(2025, 3, 7, 10, 35, 26, 226185, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   │   started_at=datetime.datetime(2025, 3, 7, 10, 35, 24, 236359, tzinfo=TzInfo(-08:00))\n",
+       "│   │   │   ),\n",
+       "│   │   │   ToolExecutionStep(\n",
+       "│   │   │   │   step_id='e7da6bb1-a704-4a2e-9954-5d54d8a1fc5d',\n",
+       "│   │   │   │   step_type='tool_execution',\n",
+       "│   │   │   │   tool_calls=[\n",
+       "│   │   │   │   │   ToolCall(\n",
+       "│   │   │   │   │   │   arguments={'query': 'DoRA meaning in Torchtune'},\n",
+       "│   │   │   │   │   │   call_id='c2c088b9-cf2f-41b5-a050-dd5743112f48',\n",
+       "│   │   │   │   │   │   tool_name='knowledge_search'\n",
+       "│   │   │   │   │   )\n",
+       "│   │   │   │   ],\n",
+       "│   │   │   │   tool_responses=[\n",
+       "│   │   │   │   │   ToolResponse(\n",
+       "│   │   │   │   │   │   call_id='c2c088b9-cf2f-41b5-a050-dd5743112f48',\n",
+       "│   │   │   │   │   │   content=[\n",
+       "│   │   │   │   │   │   │   TextContentItem(\n",
+       "│   │   │   │   │   │   │   │   text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n',\n",
+       "│   │   │   │   │   │   │   │   type='text'\n",
+       "│   │   │   │   │   │   │   ),\n",
+       "│   │   │   │   │   │   │   TextContentItem(\n",
+       "│   │   │   │   │   │   │   │   text='Result 1:\\nDocument_id:num-0\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n  tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n  model.apply_lora_to_mlp=True \\\\\\n  model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\\\\n  model.lora_rank=16 \\\\\\n  model.lora_alpha=32 \\\\\\n  model.use_dora=True \\\\\\n  model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n  model:\\n    _component_: torchtune.models.lora_llama3_8b\\n    apply_lora_to_mlp: True\\n    lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\\n    lora_rank: 16\\n    lora_alpha: 32\\n    use_dora: True\\n    quantize_base: True\\n\\n\\n.. note::\\n\\n   Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n   out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP <https://pytorch.org/docs/stable/fsdp.html>`.\\n.. .. _glossary_fsdp2:\\n\\n',\n",
+       "│   │   │   │   │   │   │   │   type='text'\n",
+       "│   │   │   │   │   │   │   ),\n",
+       "│   │   │   │   │   │   │   TextContentItem(\n",
+       "│   │   │   │   │   │   │   │   text='Result 2:\\nDocument_id:num-1\\nContent:  conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n    from torchtune.datasets import chat_dataset\\n    from torchtune.models.llama3 import llama3_tokenizer\\n\\n    tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\\n    ds = chat_dataset(\\n        tokenizer=tokenizer,\\n        source=\"json\",\\n        data_files=\"data/my_data.json\",\\n        split=\"train\",\\n        conversation_column=\"dialogue\",\\n        conversation_style=\"sharegpt\",\\n    )\\n\\n.. code-block:: yaml\\n\\n    # In config\\n    tokenizer:\\n      _component_: torchtune.models.llama3.llama3_tokenizer\\n      path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n    dataset:\\n      _component_: torchtune.datasets.chat_dataset\\n      source: json\\n      data_files: data/my_data.json\\n      split: train\\n      conversation_column: dialogue\\n      conversation_style: sharegpt\\n\\n.. note::\\n    You can pass in any keyword argument for `load_dataset <https://huggingface.co/docs/datasets/v2.20.0/en/package_reference/loading_methods#datasets.load_dataset>`_ into all our\\n    Dataset classes and they will honor them. This is useful for common parameters\\n    such as specifying the data split with :code:`split` or configuration with\\n    :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations <https://\\n',\n",
+       "│   │   │   │   │   │   │   │   type='text'\n",
+       "│   │   │   │   │   │   │   ),\n",
+       "│   │   │   │   │   │   │   TextContentItem(\n",
+       "│   │   │   │   │   │   │   │   text=\"Result 3:\\nDocument_id:num-5\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA <https://arxiv.org/abs/2106.09685>`_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune<lora_recipe_label>`.\\n\\n.. grid:: 2\\n\\n    .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n      * What LoRA is and how it saves memory during finetuning\\n      * An overview of LoRA components in torchtune\\n      * How to run a LoRA finetune using torchtune\\n      * How to experiment with different LoRA configurations\\n\\n    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n      * Be familiar with :ref:`torchtune<overview_label>`\\n      * Make sure to :ref:`install torchtune<install_label>`\\n      * Make sure you have downloaded the :ref:`Llama2-7B model weights<download_llama_label>`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA <https://arxiv.org/abs/2106.09685>`_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n    If you're unfamiliar, check out these references for the `definition of rank <https://en.wikipedia.org/wiki/Rank_(linear_algebra)>`_\\n    and discussion of `low-rank approximations <https://en.wikipedia.org/wiki/Low-rank_approximation>`_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW <https://py\\n\",\n",
+       "│   │   │   │   │   │   │   │   type='text'\n",
+       "│   │   │   │   │   │   │   ),\n",
+       "│   │   │   │   │   │   │   TextContentItem(\n",
+       "│   │   │   │   │   │   │   │   text='Result 4:\\nDocument_id:num-0\\nContent:  use the :class:`torch.optim.AdamW` optimizer with ``fused=True`` as the base optimizer. For example, to use this optimizer to offload\\nboth optimizer states and gradients to CPU:\\n\\n.. code-block:: bash\\n\\n  tune run <RECIPE> --config <CONFIG> \\\\\\n  optimizer=optimizer=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n  optimizer.offload_gradients=True \\\\\\n  lr=4e-5\\n\\n\\nor by directly :ref:`modifying a config file<config_tutorial_label>`:\\n\\n.. code-block:: yaml\\n\\n  optimizer:\\n    _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n    offload_gradients: True\\n    # additional key-word arguments can be passed to torch.optim.AdamW\\n    lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer(\\n     model.parameters(), # your model here\\n     Adam,\\n     lr=1e-5,\\n     fused=True\\n )\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page <https://github.com/pytorch/ao/tree/main/torchao/prototype/low_bit_optim#optimizer-cpu-offload>`_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. To minimize the slowdown, it is recommended to (1) use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and (2) give GPU more work per optimizer step to amortize the offloading time (e.g. larger batch size with activation checkpointing, gradient accumulation).\\n* Gradient accumulation should always be set to 1 when ``offload_gradients=True``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``fsdp_cpu_offload=True`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 <https://github.com/pytorch/torchtitan/blob/main/docs/fsdp\\n',\n",
+       "│   │   │   │   │   │   │   │   type='text'\n",
+       "│   │   │   │   │   │   │   ),\n",
+       "│   │   │   │   │   │   │   TextContentItem(\n",
+       "│   │   │   │   │   │   │   │   text='Result 5:\\nDocument_id:num-5\\nContent:  from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n  # Assuming that base_model already has the pretrained Llama2 weights,\\n  # this will directly load them into your LoRA model without any conversion necessary.\\n  lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n    Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n    the loaded :code:`state_dict` are as expected. torchtune\\'s LoRA recipes do this by default via\\n    :func:`validate_missing_and_unexpected_for_lora() <torchtune.modules.peft.validate_missing_and_unexpected_for_lora>`.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n  from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n  # Fetch all params from the model that are associated with LoRA.\\n  lora_params = get_adapter_params(lora_model)\\n\\n  # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n  set_trainable_params(lora_model, lora_params)\\n\\n  # Print the total number of parameters\\n  total_params = sum([p.numel() for p in lora_model.parameters()])\\n  trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n  print(\\n    f\"\"\"\\n    {total_params} total params,\\n    {trainable_params}\" trainable params,\\n    {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n    \"\"\"\\n  )\\n\\n  6742609920 total params,\\n  4194304 trainable params,\\n  0.06% of all params are trainable.\\n\\n.. note::\\n    If you are directly using the LoRA recipe (as detailed :ref:`here<lora_recipe_label>`), you need only pass the\\n    relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n    of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe <https://github.com/pytorch/torchtune/blob/48626d19d2108f92\\n',\n",
+       "│   │   │   │   │   │   │   │   type='text'\n",
+       "│   │   │   │   │   │   │   ),\n",
+       "│   │   │   │   │   │   │   TextContentItem(text='END of knowledge_search tool results.\\n', type='text')\n",
+       "│   │   │   │   │   │   ],\n",
+       "│   │   │   │   │   │   tool_name='knowledge_search',\n",
+       "│   │   │   │   │   │   metadata={'document_ids': ['num-0', 'num-1', 'num-5', 'num-0', 'num-5']}\n",
+       "│   │   │   │   │   )\n",
+       "│   │   │   │   ],\n",
+       "│   │   │   │   turn_id='bb111412-e2e9-40ca-9cd2-87df200807ab',\n",
+       "│   │   │   │   completed_at=datetime.datetime(2025, 3, 7, 10, 35, 26, 339563, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   │   started_at=datetime.datetime(2025, 3, 7, 10, 35, 26, 264752, tzinfo=TzInfo(-08:00))\n",
+       "│   │   │   ),\n",
+       "│   │   │   InferenceStep(\n",
+       "│   │   │   │   api_model_response=CompletionMessage(\n",
+       "│   │   │   │   │   content='DoRA stands for \"Decoupled Orthogonal Random Axes\" in the context of the Torchtune project.',\n",
+       "│   │   │   │   │   role='assistant',\n",
+       "│   │   │   │   │   stop_reason='end_of_turn',\n",
+       "│   │   │   │   │   tool_calls=[]\n",
+       "│   │   │   │   ),\n",
+       "│   │   │   │   step_id='400e49e1-f33e-41da-b22a-f1d2338a27c8',\n",
+       "│   │   │   │   step_type='inference',\n",
+       "│   │   │   │   turn_id='bb111412-e2e9-40ca-9cd2-87df200807ab',\n",
+       "│   │   │   │   completed_at=datetime.datetime(2025, 3, 7, 10, 35, 27, 281430, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   │   │   started_at=datetime.datetime(2025, 3, 7, 10, 35, 26, 351029, tzinfo=TzInfo(-08:00))\n",
+       "│   │   │   )\n",
+       "│   │   ],\n",
+       "│   │   turn_id='bb111412-e2e9-40ca-9cd2-87df200807ab',\n",
+       "│   │   completed_at=datetime.datetime(2025, 3, 7, 10, 35, 27, 294253, tzinfo=TzInfo(-08:00)),\n",
+       "│   │   output_attachments=[]\n",
+       ")\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1;35mTurn\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33minput_messages\u001b[0m=\u001b[1m[\u001b[0m\u001b[1;35mUserMessage\u001b[0m\u001b[1m(\u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'What does DoRA stand for in torchtune?'\u001b[0m, \u001b[33mrole\u001b[0m=\u001b[32m'user'\u001b[0m, \u001b[33mcontext\u001b[0m=\u001b[3;35mNone\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33moutput_message\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m'DoRA stands for \"Decoupled Orthogonal Random Axes\" in the context of the Torchtune project.'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33msession_id\u001b[0m=\u001b[32m'b5b5b9c5-1f14-404a-9677-cdb413b9f328'\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m7\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m24\u001b[0m, \u001b[1;36m235903\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.timezone\u001b[0m\u001b[1m(\u001b[0m\u001b[1;35mdatetime.timedelta\u001b[0m\u001b[1m(\u001b[0m\u001b[33mdays\u001b[0m=\u001b[1;36m-1\u001b[0m, \u001b[33mseconds\u001b[0m=\u001b[1;36m57600\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[33msteps\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1;35mInferenceStep\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mapi_model_response\u001b[0m=\u001b[1;35mCompletionMessage\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[32m''\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mrole\u001b[0m=\u001b[32m'assistant'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mstop_reason\u001b[0m=\u001b[32m'end_of_turn'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[1;35mToolCall\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[33marguments\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'query'\u001b[0m: \u001b[32m'DoRA meaning in Torchtune'\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[33mcall_id\u001b[0m=\u001b[32m'c2c088b9-cf2f-41b5-a050-dd5743112f48'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[33mtool_name\u001b[0m=\u001b[32m'knowledge_search'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstep_id\u001b[0m=\u001b[32m'27ba55cd-0252-4cff-8141-129b3b8dd021'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstep_type\u001b[0m=\u001b[32m'inference'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ 
\u001b[0m\u001b[33mturn_id\u001b[0m=\u001b[32m'bb111412-e2e9-40ca-9cd2-87df200807ab'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mcompleted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m7\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m26\u001b[0m, \u001b[1;36m226185\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstarted_at\u001b[0m=\u001b[1;35mdatetime\u001b[0m\u001b[1;35m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2025\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m7\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m35\u001b[0m, \u001b[1;36m24\u001b[0m, \u001b[1;36m236359\u001b[0m, \u001b[33mtzinfo\u001b[0m=\u001b[1;35mTzInfo\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m-08\u001b[0m:\u001b[1;36m00\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[1;35mToolExecutionStep\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstep_id\u001b[0m=\u001b[32m'e7da6bb1-a704-4a2e-9954-5d54d8a1fc5d'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mstep_type\u001b[0m=\u001b[32m'tool_execution'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtool_calls\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;35mToolCall\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33marguments\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'query'\u001b[0m: \u001b[32m'DoRA meaning in Torchtune'\u001b[0m\u001b[1m}\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mcall_id\u001b[0m=\u001b[32m'c2c088b9-cf2f-41b5-a050-dd5743112f48'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mtool_name\u001b[0m=\u001b[32m'knowledge_search'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ \u001b[0m\u001b[33mtool_responses\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1;35mToolResponse\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mcall_id\u001b[0m=\u001b[32m'c2c088b9-cf2f-41b5-a050-dd5743112f48'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[33mcontent\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[1;35mTextContentItem\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n'\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ │ │ \u001b[0m\u001b[33mtype\u001b[0m=\u001b[32m'text'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[1;35mTextContentItem\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m=\u001b[32m'Result 1:\\nDocument_id:num-0\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``:\\n\\n.. 
code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA \u001b[0m\u001b[32m<\u001b[0m\u001b[32mglossary_lora\u001b[0m\u001b[32m>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``\u001b[0m\u001b[32mquantize\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.\u001b[0m\u001b[32mapply_lora_to_mlp\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_attn_modules\u001b[0m\u001b[32m=\u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\",\"k_proj\",\"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_rank\u001b[0m\u001b[32m=\u001b[0m\u001b[32m16\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mlora_alpha\u001b[0m\u001b[32m=\u001b[0m\u001b[32m32\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n model.\u001b[0m\u001b[32mquantize_base\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: \u001b[0m\u001b[32m[\u001b[0m\u001b[32m\"q_proj\", \"k_proj\", \"v_proj\"\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we\\'ve enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``\u001b[0m\u001b[32muse_dora\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel \u001b[0m\u001b[32m(\u001b[0m\u001b[32mFSDP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ │ \u001b[0m\u001b[33mtype\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'text'\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ \u001b[0m\u001b[1;35mTextContentItem\u001b[0m\u001b[1;39m(\u001b[0m\n", + "\u001b[2;32m│ │ │ │ │ │ │ │ \u001b[0m\u001b[33mtext\u001b[0m\u001b[39m=\u001b[0m\u001b[32m'Result 2:\\nDocument_id:num-1\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\"\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n ds = chat_dataset\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32mtokenizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32mtokenizer\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32msource\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"json\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mdata_files\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"data\u001b[0m\u001b[32m/my_data.json\",\\n \u001b[0m\u001b[32msplit\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"train\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mconversation_column\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"dialogue\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32mconversation_style\u001b[0m\u001b[32m=\u001b[0m\u001b[32m\"sharegpt\"\u001b[0m\u001b[32m,\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we\\'re fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral\\'s :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default \u001b[0m\u001b[32m(\u001b[0m\u001b[32m:class:`~torchtune.models.mistral.MistralChatTemplate`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. 
note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA \u001b[0m\u001b[32m(\u001b[0m\u001b[32mas opposed to finetuning all model parameters\u001b[0m\u001b[32m)\u001b[0m\u001b[32m,\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW --config \\\\\\n \u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=\u001b[0m\u001b[32moptimizer\u001b[0m\u001b[32m=torchao.prototype.low_bit_optim.CPUOffloadOptimizer \\\\\\n optimizer.\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m \\\\\\n \u001b[0m\u001b[32mlr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m4e\u001b[0m\u001b[32m-5\\n\\n\\nor by directly :ref:`modifying a config file`:\\n\\n.. code-block:: yaml\\n\\n optimizer:\\n _component_: torchao.prototype.low_bit_optim.CPUOffloadOptimizer\\n offload_gradients: True\\n # additional key-word arguments can be passed to torch.optim.AdamW\\n lr: 4e-5\\n\\nor using it directly in your code, which allows you to change the base optimizer:\\n\\n.. code-block:: python\\n\\n from torchao.prototype.low_bit_optim import CPUOffloadOptimizer\\n from torch.optim import Adam\\n\\n optimizer = CPUOffloadOptimizer\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, # your model here\\n Adam,\\n \u001b[0m\u001b[32mlr\u001b[0m\u001b[32m=\u001b[0m\u001b[32m1e\u001b[0m\u001b[32m-5,\\n \u001b[0m\u001b[32mfused\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\nSome helpful hints from the ``torchao`` `CPUOffloadOptimizer page `_:\\n\\n* The CPU optimizer step is often the bottleneck when optimizer CPU offload is used. To minimize the slowdown, it is recommended to \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1\u001b[0m\u001b[32m)\u001b[0m\u001b[32m use full ``bf16`` training so that parameters, gradients, and optimizer states are in ``bf16``; and \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2\u001b[0m\u001b[32m)\u001b[0m\u001b[32m give GPU more work per optimizer step to amortize the offloading time \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g. larger batch size with activation checkpointing, gradient accumulation\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n* Gradient accumulation should always be set to 1 when ``\u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m``, as gradients are cleared on GPU every backward pass.\\n* This optimizer works by keeping a copy of parameters and pre-allocating gradient memory on CPU. Therefore, expect your RAM usage to increase by 4x model size.\\n* This optimizer is only supported for single-device recipes. To use CPU-offloading in distributed recipes, use ``\u001b[0m\u001b[32mfsdp_cpu_offload\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m`` instead. See :class:`torch.distributed.fsdp.FullyShardedDataParallel` for more details and `FSDP1 vs FSDP2 `.\\n\\nOnce we\\'ve loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Set \u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m on lora_params, and \u001b[0m\u001b[32mrequires_grad\u001b[0m\u001b[32m=\u001b[0m\u001b[32mFalse\u001b[0m\u001b[32m on all others.\\n set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n # Print the total number of parameters\\n total_params = sum\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp.numel\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for p in lora_model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n trainable_params = sum\u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp.numel\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for p in lora_model.parameters\u001b[0m\u001b[32m(\u001b[0m\u001b[32m)\u001b[0m\u001b[32m if p.requires_grad\u001b[0m\u001b[32m]\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n print\u001b[0m\u001b[32m(\u001b[0m\u001b[32m\\n f\"\"\"\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32mtotal_params\u001b[0m\u001b[32m}\u001b[0m\u001b[32m total params,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32mtrainable_params\u001b[0m\u001b[32m}\u001b[0m\u001b[32m\" trainable params,\\n \u001b[0m\u001b[32m{\u001b[0m\u001b[32m(\u001b[0m\u001b[32m100.0 * trainable_params / total_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:.2f\u001b[0m\u001b[32m}\u001b[0m\u001b[32m% of all params are trainable.\\n \"\"\"\\n \u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe \u001b[0m\u001b[32m(\u001b[0m\u001b[32mas detailed :ref:`here\u001b[0m\u001b[32m`\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune\\'s `LoRA recipe Date: Fri, 7 Mar 2025 14:05:58 -0500 Subject: [PATCH 10/58] refactor: display defaults in help text (#1480) # What does this PR do? using `formatter_class=argparse.ArgumentDefaultsHelpFormatter` displays (default: DEFAULT_VALUE) for each flag. add this formatter class to build and run to show users some default values like `conda`, `8321`, etc ## Test Plan ran locally with following output: before: ``` llama stack run --help usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}] config Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution. positional arguments: config Path to config file to use for the run options: -h, --help show this help message and exit --port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. Defaults to 8321 --image-name IMAGE_NAME Name of the image to run. 
Defaults to the current conda environment --disable-ipv6 Disable IPv6 support --env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. --tls-keyfile TLS_KEYFILE Path to TLS key file for HTTPS --tls-certfile TLS_CERTFILE Path to TLS certificate file for HTTPS --image-type {conda,container,venv} Image Type used during the build. This can be either conda or container or venv. ``` after: ``` llama stack run --help usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}] config Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution. positional arguments: config Path to config file to use for the run options: -h, --help show this help message and exit --port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321) --image-name IMAGE_NAME Name of the image to run. Defaults to the current conda environment (default: None) --disable-ipv6 Disable IPv6 support (default: False) --env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: []) --tls-keyfile TLS_KEYFILE Path to TLS key file for HTTPS (default: None) --tls-certfile TLS_CERTFILE Path to TLS certificate file for HTTPS (default: None) --image-type {conda,container,venv} Image Type used during the build. This can be either conda or container or venv. (default: conda) ``` [//]: # (## Documentation) Signed-off-by: Charlie Doern --- docs/source/distributions/building_distro.md | 42 ++++++++++---------- llama_stack/cli/stack/build.py | 2 +- llama_stack/cli/stack/run.py | 5 ++- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 41c6a70bf..942596b59 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -51,25 +51,25 @@ The main points to consider are: ``` llama stack build -h - -usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] - [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] +usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run] Build a Llama stack container options: -h, --help show this help message and exit - --config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. - If this argument is not provided, you will be prompted to enter information interactively - --template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates - --list-templates Show the available templates for building a Llama Stack distribution + --config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will + be prompted to enter information interactively (default: None) + --template TEMPLATE Name of the example template config to use for build. 
You may use `llama stack build --list-templates` to check out the available templates (default: None) + --list-templates Show the available templates for building a Llama Stack distribution (default: False) --image-type {conda,container,venv} - Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. + Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default: + conda) --image-name IMAGE_NAME - [for image-type=conda] Name of the conda environment to use for the build. If - not specified, currently active Conda environment will be used. If no Conda - environment is active, you must specify a name. - --print-deps-only Print the dependencies for the stack only, without building the stack + [for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if + found. (default: None) + --print-deps-only Print the dependencies for the stack only, without building the stack (default: False) + --run Run the stack after building using the same image type, name, and other applicable arguments (default: False) + ``` After this step is complete, a file named `-build.yaml` and template file `-run.yaml` will be generated and saved at the output file path specified at the end of the command. @@ -212,8 +212,8 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con ``` llama stack run -h -usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] - [--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}] +usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE] + [--image-type {conda,container,venv}] config Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution. @@ -223,17 +223,17 @@ positional arguments: options: -h, --help show this help message and exit - --port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. Defaults to 8321 + --port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321) --image-name IMAGE_NAME - Name of the image to run. Defaults to the current conda environment - --disable-ipv6 Disable IPv6 support - --env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. + Name of the image to run. Defaults to the current conda environment (default: None) + --disable-ipv6 Disable IPv6 support (default: False) + --env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: []) --tls-keyfile TLS_KEYFILE - Path to TLS key file for HTTPS + Path to TLS key file for HTTPS (default: None) --tls-certfile TLS_CERTFILE - Path to TLS certificate file for HTTPS + Path to TLS certificate file for HTTPS (default: None) --image-type {conda,container,venv} - Image Type used during the build. This can be either conda or container or venv. + Image Type used during the build. This can be either conda or container or venv. 
(default: conda) ``` diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 61847a55d..70d74c620 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -16,7 +16,7 @@ class StackBuild(Subcommand): "build", prog="llama stack build", description="Build a Llama stack container", - formatter_class=argparse.RawTextHelpFormatter, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) self._add_arguments() self.parser.set_defaults(func=self._run_stack_build_command) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index d4e679e4b..ba2273003 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -23,7 +23,7 @@ class StackRun(Subcommand): "run", prog="llama stack run", description="""Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.""", - formatter_class=argparse.RawTextHelpFormatter, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) self._add_arguments() self.parser.set_defaults(func=self._run_stack_run_cmd) @@ -37,12 +37,13 @@ class StackRun(Subcommand): self.parser.add_argument( "--port", type=int, - help="Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. Defaults to 8321", + help="Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT.", default=int(os.getenv("LLAMA_STACK_PORT", 8321)), ) self.parser.add_argument( "--image-name", type=str, + default=os.environ.get("CONDA_DEFAULT_ENV"), help="Name of the image to run. Defaults to the current conda environment", ) self.parser.add_argument( From fbd47bb4b644939b29260333b064d5d95a49c0fb Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 11:10:07 -0800 Subject: [PATCH 11/58] feat(agent): plain function as client tool (#1479) Summary: support added in https://github.com/meta-llama/llama-stack-client-python/pull/187 Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --- docs/source/building_applications/tools.md | 8 +- tests/integration/agents/test_agents.py | 3 - .../recorded_responses/chat_completion.json | 5910 ++++++++++------- .../recorded_responses/invoke_tool.json | 68 +- 4 files changed, 3439 insertions(+), 2550 deletions(-) diff --git a/docs/source/building_applications/tools.md b/docs/source/building_applications/tools.md index da447973d..2d7313cb8 100644 --- a/docs/source/building_applications/tools.md +++ b/docs/source/building_applications/tools.md @@ -127,15 +127,11 @@ MCP tools require: ## Adding Custom Tools -When you want to use tools other than the built-in tools, you can implement a python function and decorate it with `@client_tool`. +When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed +along to the generative model. -To define a custom tool, you need to use the `@client_tool` decorator. ```python -from llama_stack_client.lib.agents.client_tool import client_tool - - # Example tool definition -@client_tool def my_tool(input: int) -> int: """ Runs my awesome tool. 
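[Editor's note] For illustration, a minimal sketch of the plain-function pattern this patch enables: a client tool is now just a Python function whose docstring describes the tool and its parameters, with no `@client_tool` decorator. The `LlamaStackClient` base URL, the model id, and the exact `Agent` keyword arguments are assumptions patterned on the test file below, not taken verbatim from this patch; the boiling-point values are illustrative only.

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent


def get_boiling_point(liquid_name: str, celcius: bool = True) -> int:
    """
    Returns the boiling point of a liquid in Celcius or Fahrenheit

    :param liquid_name: The name of the liquid
    :param celcius: Whether to return the boiling point in Celcius
    :return: The boiling point of the liquid in Celcius or Fahrenheit
    """
    # Illustrative values; the real test returns its own fixture-defined numbers.
    if liquid_name.lower() == "polyjuice":
        return -100 if celcius else -148
    return -1


# Assumed setup: a Llama Stack server running locally on the default port.
client = LlamaStackClient(base_url="http://localhost:8321")
agent = Agent(
    client,
    model="meta-llama/Llama-3.1-8B-Instruct",
    instructions="You are a helpful assistant",
    # The plain function is passed directly; its docstring is forwarded
    # to the generative model as the tool description.
    tools=[get_boiling_point],
)
```

With this change the only contract is the docstring: parameter descriptions come from the `:param:` lines and the function signature supplies the types, which is exactly what the test diff below exercises by dropping the decorator.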
diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index 718f50872..a542e5403 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -9,7 +9,6 @@ from uuid import uuid4 import pytest from llama_stack_client.lib.agents.agent import Agent -from llama_stack_client.lib.agents.client_tool import client_tool from llama_stack_client.lib.agents.event_logger import EventLogger from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument from llama_stack_client.types.memory_insert_params import Document @@ -23,7 +22,6 @@ from llama_stack.apis.agents.agents import ( ) -@client_tool def get_boiling_point(liquid_name: str, celcius: bool = True) -> int: """ Returns the boiling point of a liquid in Celcius or Fahrenheit @@ -41,7 +39,6 @@ def get_boiling_point(liquid_name: str, celcius: bool = True) -> int: return -1 -@client_tool def get_boiling_point_with_metadata(liquid_name: str, celcius: bool = True) -> Dict[str, Any]: """ Returns the boiling point of a liquid in Celcius or Fahrenheit diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index b4660d3a9..db45bbdf7 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12500,7 +12500,27 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " degrees Fahrenheit.", "type": "text" }, "event_type": { @@ -12535,59 +12555,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "ehKvLn9e", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:07.946658+00:00", - "__module__": "datetime" - }, - "trace_id": "gYfhKRXmT0qqnh4V", - "type": "metric", - "unit": "tokens", - "value": 139 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "ehKvLn9e", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:07.946690+00:00", - "__module__": "datetime" - }, - "trace_id": "gYfhKRXmT0qqnh4V", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "ehKvLn9e", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:07.946698+00:00", - "__module__": "datetime" - }, - "trace_id": "gYfhKRXmT0qqnh4V", - "type": "metric", - "unit": "tokens", - "value": 162 - } - ] + "metrics": null } } ], @@ -12641,7 +12609,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"get_boiling_point\", \"", + "text": "type\": \"function\", \"name\": \"get_boiling", "type": "text" }, "event_type": { @@ 
-12661,7 +12629,7 @@ "data": { "event": { "delta": { - "text": "parameters\": {\"liquid_name\": \"polyju", + "text": "_point\", \"parameters\": {\"liquid_name", "type": "text" }, "event_type": { @@ -12681,7 +12649,27 @@ "data": { "event": { "delta": { - "text": "ice\", \"celcius\": \"false\"}}", + "text": "\": \"polyjuice\", \"cel", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "cius\": \"false\"}}", "type": "text" }, "event_type": { @@ -12711,7 +12699,7 @@ "celcius": "false", "liquid_name": "polyjuice" }, - "call_id": "ccb7e766-3cbd-4cd1-ac24-7d59fdbd32dd", + "call_id": "e8500d03-6e74-427c-b295-77bceca074f0", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12752,59 +12740,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "f8N9xscj", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:06.326554+00:00", - "__module__": "datetime" - }, - "trace_id": "pbTGwscoS2O-TOD7", - "type": "metric", - "unit": "tokens", - "value": 91 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "f8N9xscj", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:06.326581+00:00", - "__module__": "datetime" - }, - "trace_id": "pbTGwscoS2O-TOD7", - "type": "metric", - "unit": "tokens", - "value": 45 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "f8N9xscj", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:06.326587+00:00", - "__module__": "datetime" - }, - "trace_id": "pbTGwscoS2O-TOD7", - "type": "metric", - "unit": "tokens", - "value": 136 - } - ] + "metrics": null } } ], @@ -12838,13 +12774,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "{\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -12863,13 +12794,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" + "text": " \"type\": \"function\",\n ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -12888,13 +12814,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\":", - "type": "tool_call" + "text": " \"name\": \"get_boiling_point\",\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -12913,13 +12834,48 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": 
"ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " \"polyjuice\", \"celcius\": \"true\"}}", - "type": "tool_call" + "text": " \"parameters\": {\n \"liquid", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_name\": \"polyjuice\",\n \"celcius", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"true\"\n }\n}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -12948,7 +12904,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "78adc0b9-cd6a-4052-b434-1db332fac11f", + "call_id": "ee7ca410-7953-407c-a479-09067389fa5c", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12989,59 +12945,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "4ZGPgl-J", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:55.006558+00:00", - "__module__": "datetime" - }, - "trace_id": "0JdU31UqRW6uyUfy", - "type": "metric", - "unit": "tokens", - "value": 43 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "4ZGPgl-J", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:55.006570+00:00", - "__module__": "datetime" - }, - "trace_id": "0JdU31UqRW6uyUfy", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "4ZGPgl-J", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:55.006572+00:00", - "__module__": "datetime" - }, - "trace_id": "0JdU31UqRW6uyUfy", - "type": "metric", - "unit": "tokens", - "value": 53 - } - ] + "metrics": null } } ], @@ -13095,27 +12999,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\u00b0C.", + "text": " boiling point of polyjuice is -100\u00b0C.", "type": "text" }, "event_type": { @@ -13150,59 +13034,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "TRGdCKiq", - "timestamp": { - 
"__class__": "datetime", - "__datetime__": "2025-03-06T04:49:38.684993+00:00", - "__module__": "datetime" - }, - "trace_id": "yO1YOhixQ9mpO4rb", - "type": "metric", - "unit": "tokens", - "value": 85 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "TRGdCKiq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:38.685019+00:00", - "__module__": "datetime" - }, - "trace_id": "yO1YOhixQ9mpO4rb", - "type": "metric", - "unit": "tokens", - "value": 22 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "TRGdCKiq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:38.685025+00:00", - "__module__": "datetime" - }, - "trace_id": "yO1YOhixQ9mpO4rb", - "type": "metric", - "unit": "tokens", - "value": 107 - } - ] + "metrics": null } } ], @@ -13291,59 +13123,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "lHrhiQgT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:39.714686+00:00", - "__module__": "datetime" - }, - "trace_id": "0jyTQ_JVTyO8Fz_O", - "type": "metric", - "unit": "tokens", - "value": 87 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "lHrhiQgT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:39.714720+00:00", - "__module__": "datetime" - }, - "trace_id": "0jyTQ_JVTyO8Fz_O", - "type": "metric", - "unit": "tokens", - "value": 22 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "lHrhiQgT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:39.714727+00:00", - "__module__": "datetime" - }, - "trace_id": "0jyTQ_JVTyO8Fz_O", - "type": "metric", - "unit": "tokens", - "value": 109 - } - ] + "metrics": null } } ], @@ -13407,7 +13187,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", + "tool_call": "{\"type\": \"function\", \"", "type": "tool_call" }, "event_type": { @@ -13432,7 +13212,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"cel", + "tool_call": "name\": \"get_boiling_point\", \"parameters", "type": "tool_call" }, "event_type": { @@ -13457,7 +13237,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "cius\": \"true\"}}", + "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celcius\": \"true", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -13487,7 +13292,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "ec5e1671-d607-46ae-804b-4f15e42e51b2", + "call_id": "f8adc867-71c3-472a-9f2b-95cd34c9f174", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13528,59 +13333,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "GbmO2wcg", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:38.172673+00:00", - "__module__": "datetime" - }, - "trace_id": "Fquzg9P5RfSrqSeH", - "type": "metric", - "unit": "tokens", - "value": 37 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "GbmO2wcg", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:38.172704+00:00", - "__module__": "datetime" - }, - "trace_id": "Fquzg9P5RfSrqSeH", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "GbmO2wcg", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:38.172712+00:00", - "__module__": "datetime" - }, - "trace_id": "Fquzg9P5RfSrqSeH", - "type": "metric", - "unit": "tokens", - "value": 47 - } - ] + "metrics": null } } ], @@ -13644,7 +13397,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point_with", "type": "tool_call" }, "event_type": { @@ -13669,7 +13422,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": \"get_boiling_point_with_metadata\", \"parameters\": {\"", + "tool_call": "_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"", "type": "tool_call" }, "event_type": { @@ -13694,32 +13447,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "liquid_name\": \"polyjuice\", \"celcius\":", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " \"true\"}}", + "tool_call": "celcius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13749,7 +13477,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "1f6ad98b-871e-43fd-a866-53f54acb9466", + "call_id": "df18472c-42eb-4ded-8e84-e0b79159219a", "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" @@ -13790,59 +13518,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "gn-gDCYG", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:39.300170+00:00", - 
"__module__": "datetime" - }, - "trace_id": "U3gRmVfKQK6UkwCL", - "type": "metric", - "unit": "tokens", - "value": 37 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "gn-gDCYG", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:39.300210+00:00", - "__module__": "datetime" - }, - "trace_id": "U3gRmVfKQK6UkwCL", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "gn-gDCYG", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:39.300222+00:00", - "__module__": "datetime" - }, - "trace_id": "U3gRmVfKQK6UkwCL", - "type": "metric", - "unit": "tokens", - "value": 47 - } - ] + "metrics": null } } ], @@ -13931,59 +13607,416 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "V_N39zVn", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:05.597771+00:00", - "__module__": "datetime" - }, - "trace_id": "S-YEXTxAQyqX6Sbg", - "type": "metric", - "unit": "tokens", - "value": 30 + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, 
\"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "V_N39zVn", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:05.597811+00:00", - "__module__": "datetime" - }, - "trace_id": "S-YEXTxAQyqX6Sbg", - "type": "metric", - "unit": "tokens", - "value": 24 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "V_N39zVn", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:05.597818+00:00", - "__module__": "datetime" - }, - "trace_id": "S-YEXTxAQyqX6Sbg", - "type": "metric", - "unit": "tokens", - "value": 54 + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the `bwrap.core` module is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found. This is because the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " `bwrap.core` module is not a standard Python module", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and is not installed by default.\n\nTo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " fix this issue, you can use", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the `pathlib` module to access the file directly. 
Here", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'s an updated code snippet:\n\n```python\nimport pandas", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " as pd\nfrom pathlib import Path\n\nfile_path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " = Path(\"/var/folders/cz/vyh7y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "1d11xg881lsxsshnc5c0000gn", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/T/tmpeipex0j0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/b807hgTQinflation.csv\")\ndf = pd.read_csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(file_path)\nprint(df.head())\n```\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + 
"text": "This code uses the `Path` class from the `pathlib", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "` module to create a path object for the file. The `", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "read_csv` method is then used to read the CSV file into", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a pandas DataFrame.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" } - ] + }, + "metrics": null } } ], @@ -14813,7 +14846,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/f", "type": "tool_call" }, "event_type": { @@ -14838,7 +14871,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/cz", + "tool_call": "olders/cz/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -14863,7 +14896,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/vyh7y1d11xg881", + "tool_call": "lsxsshnc5c0000gn/T/tmpeip", "type": "tool_call" }, "event_type": { @@ -14888,7 +14921,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "lsxsshnc5c0000gn/T/tmplr", + "tool_call": "ex0j0/b807hgTQinflation.csv\")\n", "type": "tool_call" }, "event_type": { @@ -14913,107 +14946,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_wf0lb/Pl4Pewubinflation.csv\")\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print the first few rows of the dataframe\nprint(df.head", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())\n\n# Print the data types of each column\nprint(df.d", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "types)\n\n# Print the summary statistics of the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dataframe\nprint(df.describe())", + "tool_call": "print(df.head())", "type": "tool_call" }, "event_type": { @@ -15040,9 +14973,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmplr_wf0lb/Pl4Pewubinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpeipex0j0/b807hgTQinflation.csv\")\nprint(df.head())" }, - "call_id": "0a037488-ab9e-46e9-bdc4-7ee6f9ef0e1e", + "call_id": "d431c3a2-5b91-4407-8323-27bc134503e0", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -15087,59 +15020,729 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "NoDjls_F", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:17.910457+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - 
"unit": "tokens", - "value": 37 + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. 
If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, 
\"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "NoDjls_F", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:17.910513+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 10 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "NoDjls_F", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:17.910522+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 47 + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the average yearly inflation over time. The x", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-axis represents the year and the y-axis represents the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " average inflation. 
Each point on the plot represents", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the average inflation for a particular year.\n\nPlease note that you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " need to replace 'inflation.csv'", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " with the actual path to your csv file. Also,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this code assumes that the 'date' column in your csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file is in a format that can be parsed by pandas' `to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_datetime` function. 
If the date is in a different", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " format, you may need to specify the format using the `format", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "` parameter of `to_datetime`.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" } - ] + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": 
{ + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data\ndf = pd.read_csv('inflation.csv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "')\n\n# Convert 'date' column to datetime\ndf['date']", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd.to_datetime(df['date'])\n\n# Group by year and calculate", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " average inflation\naverage_inflation = df.groupby(df['date'].dt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".year)['inflation'].mean()\n\n# Plot", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + 
"__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the time series\nplt.figure(figsize=(", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "10,6))\nplt.plot(average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_inflation.index, average_inflation.values", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ", marker='o')\nplt.title('Average Yearly Inflation')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" 
+ }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "ae9d3d8c-ece8-4f94-aa92-a6a93b08b43e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null } } ], @@ -16099,7 +16702,7 @@ "data": { "event": { "delta": { - "text": " seems that the file \"/var/folders", + "text": " seems that the file \"/var/folders/cz/vyh7y1", "type": "text" }, "event_type": { @@ -16119,7 +16722,7 @@ "data": { "event": { "delta": { - "text": "/cz/vyh7y1d11xg881lsx", + "text": "d11xg881lsxsshnc5c0000gn/T/t", "type": "text" }, "event_type": { @@ -16139,7 +16742,7 @@ "data": { "event": { "delta": { - "text": "sshnc5c0000gn/T/t", + "text": "mpr3640a7b/Y5UaJew2inflation", "type": "text" }, "event_type": { @@ -16159,7 +16762,7 @@ "data": { "event": { "delta": { - "text": "mplr_wf0lb/p99E", + "text": ".csv\" does not exist. \n\nTo describe the csv file, you need", "type": "text" }, "event_type": { @@ -16179,7 +16782,7 @@ "data": { "event": { "delta": { - "text": "7wY2inflation.csv\" does not exist. \n\n", + "text": " to provide the actual file path or the file itself. If the file is", "type": "text" }, "event_type": { @@ -16199,7 +16802,7 @@ "data": { "event": { "delta": { - "text": "To describe the csv file, you need to provide the actual file", + "text": " in your current directory, you can use the following code:\n\n```python\n", "type": "text" }, "event_type": { @@ -16219,7 +16822,7 @@ "data": { "event": { "delta": { - "text": " path or the file itself. 
If you are using a local file", + "text": "import pandas as pd\n# Load data\n", "type": "text" }, "event_type": { @@ -16239,7 +16842,7 @@ "data": { "event": { "delta": { - "text": ", you can use the `load_data` function from the `", + "text": "df = pd.read_csv('inflation.csv')\n# Print", "type": "text" }, "event_type": { @@ -16259,7 +16862,7 @@ "data": { "event": { "delta": { - "text": "code_interpreter` library to load the", + "text": " the first 5 rows of the dataframe\nprint(df.head())\n# Print the", "type": "text" }, "event_type": { @@ -16279,7 +16882,7 @@ "data": { "event": { "delta": { - "text": " file. \n\nHere is an example of how you can describe", + "text": " summary of the dataframe\nprint(df.info())\nprint(df.describe())\n```\n\n", "type": "text" }, "event_type": { @@ -16299,7 +16902,7 @@ "data": { "event": { "delta": { - "text": " the csv file:\n\n```\nimport pandas as", + "text": "This will print the first 5 rows of the dataframe, the summary of", "type": "text" }, "event_type": { @@ -16319,7 +16922,7 @@ "data": { "event": { "delta": { - "text": " pd\nfrom code_interpreter import load_data\n\n# Load data", + "text": " the dataframe (including the index dtype and column count), and the description of", "type": "text" }, "event_type": { @@ -16339,7 +16942,7 @@ "data": { "event": { "delta": { - "text": "\ndf = load_data('inflation.csv')\n\n# Print summary of", + "text": " the dataframe (including count, mean, std, min, 25%,", "type": "text" }, "event_type": { @@ -16359,187 +16962,7 @@ "data": { "event": { "delta": { - "text": " the data\nprint(df.head()) #", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Print the first few rows of the data\nprint(df.info())", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " # Print information about the data\nprint(df.describe()) ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " # Print summary statistics about the data\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "```\n\nPlease replace 'inflation.csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": 
"llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "' with your actual csv file name.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \n\nIf you are using a remote file, you need to provide", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the actual file path or the file itself.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \n\nAlso, make sure that the file is in the correct format", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and that the pandas library can read it correctly.", + "text": " 50%, 75%, max for each column).", "type": "text" }, "event_type": { @@ -16574,59 +16997,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "rE7rhw1s", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:30.946947+00:00", - "__module__": "datetime" - }, - "trace_id": "RPZJ19J7SzaX6t6h", - "type": "metric", - "unit": "tokens", - "value": 213 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "rE7rhw1s", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:30.946979+00:00", - "__module__": "datetime" - }, - "trace_id": "RPZJ19J7SzaX6t6h", - "type": "metric", - "unit": "tokens", - "value": 261 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "rE7rhw1s", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:30.946982+00:00", - "__module__": "datetime" - }, - "trace_id": "RPZJ19J7SzaX6t6h", - "type": "metric", - "unit": "tokens", - "value": 474 - } - ] + "metrics": null } } ], @@ -16690,7 +17061,7 @@ 
"__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/", "type": "tool_call" }, "event_type": { @@ -16715,7 +17086,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7y1d", + "tool_call": "var/folders/cz/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -16740,7 +17111,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "11xg881lsxsshnc5c0000gn/T", + "tool_call": "lsxsshnc5c0000gn", "type": "tool_call" }, "event_type": { @@ -16765,7 +17136,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/tmplr_wf0lb/p99E7wY2", + "tool_call": "/T/tmpr3640a7b", "type": "tool_call" }, "event_type": { @@ -16790,7 +17161,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "inflation.csv\")\n#", + "tool_call": "/Y5UaJew2", "type": "tool_call" }, "event_type": { @@ -16815,7 +17186,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Rows\nprint(\"Number of rows and columns in the", + "tool_call": "inflation.csv\")\n# Rows\nprint(\"", "type": "tool_call" }, "event_type": { @@ -16840,7 +17211,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of", + "tool_call": "Number of rows and columns in the", "type": "tool_call" }, "event_type": { @@ -16865,7 +17236,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the data are:\", len(df.columns))\n", + "tool_call": " data:\", df.shape)\n# Columns\nprint", "type": "tool_call" }, "event_type": { @@ -16890,7 +17261,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "# Column names\nprint(\"Columns of", + "tool_call": "(\"Columns of the data are:\", len", "type": "tool_call" }, "event_type": { @@ -16915,7 +17286,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the data are:\", df.columns)\n# Column dtypes\n", + "tool_call": "(df.columns))\n# Column names\nprint(\"", "type": "tool_call" }, "event_type": { @@ -16940,7 +17311,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "print(\"Datatype of the columns are:\", df.dtypes)", + "tool_call": "Columns of the data are:\", df.columns)\n# Column dtypes\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(\"Datatype of the columns are:\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + 
} + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " df.dtypes)", "type": "tool_call" }, "event_type": { @@ -16967,9 +17388,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmplr_wf0lb/p99E7wY2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpr3640a7b/Y5UaJew2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "1db58db0-92c5-4e65-8e83-631bef020ef4", + "call_id": "c18dbae3-9ce0-4914-8062-20a3987959e4", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -17014,59 +17435,689 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "W_qnYIUI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:29.106322+00:00", - "__module__": "datetime" - }, - "trace_id": "RPZJ19J7SzaX6t6h", - "type": "metric", - "unit": "tokens", - "value": 36 + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. 
Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. 
note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. 
All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d4e29\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "6070c836-0c9c-4f87-ba52-d9bf9ed44195", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune based on the documentation you provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null } } ], @@ -18828,6 +19879,708 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:5c435\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"query", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"How to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "3f9aaa8a-ca61-4a51-830a-e9920d3d8ec5", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " documentation you provided. 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { @@ -18911,7 +20664,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Torchtune documentation\"}}", + "tool_call": "\": {\"query\": \"Torchtune documentation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -18940,7 +20718,7 @@ "arguments": { "query": "Torchtune documentation" }, - "call_id": "26bf5efc-c1da-4229-86d9-853f45d3a0f6", + "call_id": "5c14ec34-3e33-4d90-b376-5086fed1c306", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -18981,226 +20759,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "UUPCfOjW", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:06.661392+00:00", - "__module__": "datetime" - }, - "trace_id": "edTwKHK5Q4K8yCqt", - "type": "metric", - "unit": "tokens", - "value": 39 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "UUPCfOjW", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:06.661422+00:00", - "__module__": "datetime" - }, - "trace_id": "edTwKHK5Q4K8yCqt", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "UUPCfOjW", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:06.663497+00:00", - "__module__": "datetime" - }, - "trace_id": "edTwKHK5Q4K8yCqt", - "type": "metric", - "unit": "tokens", - "value": 49 - } - ] + "metrics": null } } ], "type": "generator" }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": 
[{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. 
_prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " attention type used", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " by Llama3-8B is grouped-query attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qzbGsIc-", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:56.822860+00:00", - "__module__": "datetime" - }, - "trace_id": "5LMJTs_wRBiwAPaF", - "type": "metric", - "unit": "tokens", - "value": 80 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qzbGsIc-", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:56.822890+00:00", - "__module__": "datetime" - }, - "trace_id": "5LMJTs_wRBiwAPaF", - "type": "metric", - "unit": "tokens", - "value": 26 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qzbGsIc-", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:56.822897+00:00", - "__module__": "datetime" - }, - "trace_id": "5LMJTs_wRBiwAPaF", - "type": "metric", - "unit": "tokens", - "value": 106 - } - ] - } - } - ], - 
"type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. 
code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. 
note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -19303,59 +20868,116 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "WbLMJeWt", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:43.468600+00:00", - "__module__": "datetime" - }, - "trace_id": "ISGpsBHRTjG_DfWw", - "type": "metric", - "unit": "tokens", - "value": 80 + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "WbLMJeWt", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:43.468641+00:00", - "__module__": "datetime" - }, - "trace_id": "ISGpsBHRTjG_DfWw", - "type": "metric", - "unit": "tokens", - "value": 26 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "WbLMJeWt", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:43.468649+00:00", - "__module__": "datetime" - }, - "trace_id": "ISGpsBHRTjG_DfWw", - "type": "metric", - "unit": "tokens", - "value": 106 + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8B", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" } - ] + }, + "metrics": null } } ], @@ -19409,7 +21031,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"knowledge_search", + "text": " \"type\": \"function\",\n \"name\": \"knowledge_search\",\n", "type": "text" }, "event_type": { @@ -19429,7 +21051,7 @@ "data": { "event": { "delta": { - "text": "\",\n \"parameters\": {\n \"query\": \"Llama3-", + "text": " \"parameters\": {\n \"", "type": "text" }, 
"event_type": { @@ -19449,7 +21071,27 @@ "data": { "event": { "delta": { - "text": "8B attention type\"\n }\n}", + "text": "query\": \"Llama3-8B attention type\"\n }\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "}", "type": "text" }, "event_type": { @@ -19478,7 +21120,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "50f2c13d-14c1-417e-bc85-89e23afab120", + "call_id": "caa1f5c4-6de8-4999-a22c-97ea4750d4aa", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -19519,59 +21161,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "5I5ujhpm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:45.629100+00:00", - "__module__": "datetime" - }, - "trace_id": "5LMJTs_wRBiwAPaF", - "type": "metric", - "unit": "tokens", - "value": 40 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "5I5ujhpm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:45.629127+00:00", - "__module__": "datetime" - }, - "trace_id": "5LMJTs_wRBiwAPaF", - "type": "metric", - "unit": "tokens", - "value": 48 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "5I5ujhpm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:45.629133+00:00", - "__module__": "datetime" - }, - "trace_id": "5LMJTs_wRBiwAPaF", - "type": "metric", - "unit": "tokens", - "value": 88 - } - ] + "metrics": null } } ], @@ -19635,7 +21225,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "tool_call" }, "event_type": { @@ -19660,32 +21250,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"knowledge_search\", \"parameters\": {\"query\": \"Llama", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3-8B attention type\"}}", + "tool_call": "\": {\"query\": \"Llama3-8B attention type\"}}", "type": "tool_call" }, "event_type": { @@ -19714,7 +21279,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "70b24279-f0ed-49cc-ab4f-9bd3d7af9554", + "call_id": "3aab4108-2ae3-4d71-a27d-7beb09330752", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -19755,59 +21320,7 @@ "value": "end_of_turn" 
} }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "9GrKkBwq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:39.870328+00:00", - "__module__": "datetime" - }, - "trace_id": "ISGpsBHRTjG_DfWw", - "type": "metric", - "unit": "tokens", - "value": 40 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "9GrKkBwq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:39.870341+00:00", - "__module__": "datetime" - }, - "trace_id": "ISGpsBHRTjG_DfWw", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "9GrKkBwq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:39.870347+00:00", - "__module__": "datetime" - }, - "trace_id": "ISGpsBHRTjG_DfWw", - "type": "metric", - "unit": "tokens", - "value": 50 - } - ] + "metrics": null } } ], @@ -19954,6 +21467,155 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta has donated $1 million to President-elect Donald Trump's inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark\\\", \\\"score\\\": 0.6701125, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"META | Meta Platforms Inc. Company Profile & Executives - WSJ\\\", \\\"url\\\": \\\"https://www.wsj.com/market-data/quotes/META/company-people\\\", \\\"content\\\": \\\"Company profile for Meta Platforms Inc. including key executives, insider trading, ownership, revenue and average growth rates. 
View detailed META description & address.\\\", \\\"score\\\": 0.23361932, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + 
"logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is not explicitly stated in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the search results. However, Mark Zuckerberg is mentioned as the CEO", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of Meta in some of the search results, but it is not clear", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " if he is still the current CEO.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", 
\"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -20041,7 +21703,7 @@ "arguments": { "query": "current CEO of Meta" }, - "call_id": "f84788f5-ef46-4e13-aa57-3ea4ecb223c1", + "call_id": "8e303404-99c1-4610-9e53-82440614bf51", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -20086,59 +21748,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "tWTHAFOr", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:17.453332+00:00", - "__module__": "datetime" - }, - "trace_id": "K0psyd28TdSkb8LK", - "type": "metric", - "unit": "tokens", - "value": 34 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "tWTHAFOr", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:17.453359+00:00", - "__module__": "datetime" - }, - "trace_id": "K0psyd28TdSkb8LK", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "tWTHAFOr", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:17.453365+00:00", - "__module__": "datetime" - }, - "trace_id": "K0psyd28TdSkb8LK", - "type": "metric", - "unit": "tokens", - "value": 44 - } - ] + "metrics": null } } ], @@ -20232,27 +21842,7 @@ "data": { "event": { "delta": { - "text": ". The function is only able", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to find the boiling point of real liquids.", + "text": ". 
The function is only able to find the boiling point of real liquids.", "type": "text" }, "event_type": { @@ -20287,246 +21877,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "ZFinp6U7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:30.079245+00:00", - "__module__": "datetime" - }, - "trace_id": "mUx8OGhtSEW1DSOB", - "type": "metric", - "unit": "tokens", - "value": 70 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "ZFinp6U7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:30.079279+00:00", - "__module__": "datetime" - }, - "trace_id": "mUx8OGhtSEW1DSOB", - "type": "metric", - "unit": "tokens", - "value": 56 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "ZFinp6U7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:30.079284+00:00", - "__module__": "datetime" - }, - "trace_id": "mUx8OGhtSEW1DSOB", - "type": "metric", - "unit": "tokens", - "value": 126 - } - ] + "metrics": null } } ], "type": "generator" }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", 
\"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function `get_boiling_point` is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " able to find the boiling point of poly", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "juice as it is not a real liquid.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "JtmG7Qaq", - "timestamp": { - "__class__": "datetime", - 
"__datetime__": "2025-03-06T04:47:53.738043+00:00", - "__module__": "datetime" - }, - "trace_id": "g2nkdPGEQ_KS9-qQ", - "type": "metric", - "unit": "tokens", - "value": 70 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "JtmG7Qaq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:53.738072+00:00", - "__module__": "datetime" - }, - "trace_id": "g2nkdPGEQ_KS9-qQ", - "type": "metric", - "unit": "tokens", - "value": 38 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "JtmG7Qaq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:53.738079+00:00", - "__module__": "datetime" - }, - "trace_id": "g2nkdPGEQ_KS9-qQ", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -20594,7 +21951,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice as it is not", + "text": " 
boiling point of polyjuice as it is not a real liquid", "type": "text" }, "event_type": { @@ -20614,7 +21971,7 @@ "data": { "event": { "delta": { - "text": " a real liquid.", + "text": ".", "type": "text" }, "event_type": { @@ -20649,59 +22006,136 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "hyoRl-YH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:15.559044+00:00", - "__module__": "datetime" - }, - "trace_id": "pHT6bhi3THO6qYi9", - "type": "metric", - "unit": "tokens", - "value": 70 + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "hyoRl-YH", - "timestamp": { - "__class__": "datetime", - "__datetime__": 
"2025-03-06T04:48:15.559075+00:00", - "__module__": "datetime" - }, - "trace_id": "pHT6bhi3THO6qYi9", - "type": "metric", - "unit": "tokens", - "value": 38 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "hyoRl-YH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:15.559082+00:00", - "__module__": "datetime" - }, - "trace_id": "pHT6bhi3THO6qYi9", - "type": "metric", - "unit": "tokens", - "value": 108 + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " able to find the boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" } - ] + }, + "metrics": null } } ], @@ -20790,7 +22224,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -20819,7 +22278,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "ae161bf4-6f03-4830-8f08-3999d20c066a", + "call_id": "3d4300a8-2093-458d-8195-3530acaea9e6", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -20860,59 +22319,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "HLJCauvN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:28.686660+00:00", - "__module__": "datetime" - }, - "trace_id": "3uSIGGP2TcatIhQ7", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "HLJCauvN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:28.686691+00:00", - "__module__": "datetime" - }, - "trace_id": "3uSIGGP2TcatIhQ7", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "HLJCauvN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:28.686695+00:00", - "__module__": "datetime" - }, - "trace_id": "3uSIGGP2TcatIhQ7", - "type": "metric", - "unit": "tokens", - "value": 40 - } - ] + "metrics": null } } ], @@ -20976,7 +22383,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -21001,32 +22408,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"poly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "juice\"}}", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -21055,7 +22437,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "c8369271-9c41-4787-b5a7-0280822f3732", + "call_id": "da92286f-5b46-45e6-a2ae-a224279323c7", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -21096,59 +22478,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "Ta9THPS8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:52.569263+00:00", - "__module__": "datetime" - }, - 
"trace_id": "W6rZ8mwBRRu661Ox", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "Ta9THPS8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:52.569291+00:00", - "__module__": "datetime" - }, - "trace_id": "W6rZ8mwBRRu661Ox", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "Ta9THPS8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:52.569297+00:00", - "__module__": "datetime" - }, - "trace_id": "W6rZ8mwBRRu661Ox", - "type": "metric", - "unit": "tokens", - "value": 40 - } - ] + "metrics": null } } ], @@ -21202,7 +22532,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of Poly", + "text": " couldn't find any information on the boiling point of Polyjuice. Polyju", "type": "text" }, "event_type": { @@ -21222,7 +22552,7 @@ "data": { "event": { "delta": { - "text": "juice. Polyjuice is a magical potion in the", + "text": "ice is a magical potion in the Harry Potter series that allows the drinker to", "type": "text" }, "event_type": { @@ -21242,7 +22572,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series that allows the drinker", + "text": " transform into someone else. It's not a physical substance with a boiling point.", "type": "text" }, "event_type": { @@ -21262,7 +22592,7 @@ "data": { "event": { "delta": { - "text": " to transform into someone else. It", + "text": " If you have any other questions, I'd", "type": "text" }, "event_type": { @@ -21282,47 +22612,7 @@ "data": { "event": { "delta": { - "text": "'s not a physical substance with a boiling point.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " If you have any other questions, I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'d be happy to help.", + "text": " be happy to help.", "type": "text" }, "event_type": { @@ -21357,59 +22647,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "FRDVTn1V", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:17.228586+00:00", - "__module__": "datetime" - }, - "trace_id": "3GXhBV5vSn2cf6Pi", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "FRDVTn1V", - "timestamp": { - "__class__": "datetime", - "__datetime__": 
"2025-03-06T04:48:17.228639+00:00", - "__module__": "datetime" - }, - "trace_id": "3GXhBV5vSn2cf6Pi", - "type": "metric", - "unit": "tokens", - "value": 73 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "FRDVTn1V", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:17.228647+00:00", - "__module__": "datetime" - }, - "trace_id": "3GXhBV5vSn2cf6Pi", - "type": "metric", - "unit": "tokens", - "value": 103 - } - ] + "metrics": null } } ], @@ -21473,7 +22711,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -21498,7 +22736,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -21527,7 +22790,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "63bb757c-e433-4e14-b527-6989b7ae6582", + "call_id": "afbebcb6-ec6b-4e08-99d5-4f92dc68d840", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -21568,59 +22831,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "j1OaNojM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:09.337637+00:00", - "__module__": "datetime" - }, - "trace_id": "ZAeUlaWpRVSas5hb", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "j1OaNojM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:09.337664+00:00", - "__module__": "datetime" - }, - "trace_id": "ZAeUlaWpRVSas5hb", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "j1OaNojM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:09.337668+00:00", - "__module__": "datetime" - }, - "trace_id": "ZAeUlaWpRVSas5hb", - "type": "metric", - "unit": "tokens", - "value": 40 - } - ] + "metrics": null } } ], @@ -21709,59 +22920,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "uwED-DA9", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:27.524949+00:00", - "__module__": "datetime" - }, 
- "trace_id": "04_0VtRzTY-hrOyG", - "type": "metric", - "unit": "tokens", - "value": 251 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "uwED-DA9", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:27.524984+00:00", - "__module__": "datetime" - }, - "trace_id": "04_0VtRzTY-hrOyG", - "type": "metric", - "unit": "tokens", - "value": 20 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "uwED-DA9", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:27.524991+00:00", - "__module__": "datetime" - }, - "trace_id": "04_0VtRzTY-hrOyG", - "type": "metric", - "unit": "tokens", - "value": 271 - } - ] + "metrics": null } } ], @@ -21875,7 +23034,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2 == 0 or n % 3 == 0:\n return False", + "tool_call": "2 == 0 or n % 3 ==", "type": "tool_call" }, "event_type": { @@ -21900,7 +23059,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n i = 5\n while i * i <= n:\n ", + "tool_call": " 0:\n return False\n i = 5\n ", "type": "tool_call" }, "event_type": { @@ -21925,7 +23084,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n % i == 0 or n % (i + 2)", + "tool_call": " while i * i <= n:\n if", "type": "tool_call" }, "event_type": { @@ -21950,7 +23109,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " == 0:\n return False", + "tool_call": " n % i == 0 or n % (i + ", "type": "tool_call" }, "event_type": { @@ -21975,7 +23134,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n", + "tool_call": "2) == 0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -22000,7 +23159,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count", + "tool_call": " += 6\n return True\n\ndef get_nth_prime(n):\n count =", "type": "tool_call" }, "event_type": { @@ -22025,7 +23184,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " == n:\n return num\n num", + "tool_call": " 0\n num = 2\n ", "type": "tool_call" }, "event_type": { @@ -22050,7 +23209,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " += 1\n\nprint(get_nth_prime(100))", + "tool_call": " while True:\n if is_prime(num):\n count +=", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 1\n if count == n:\n return num\n num +=", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 1\n\nprint(get_nth_prime(100))", "type": "tool_call" }, "event_type": { @@ -22079,7 +23288,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "297a9d9d-daaf-4d90-9496-2648a659aa27", + "call_id": "1d9ced32-c0fa-467b-9299-a4f38cf06926", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -22124,59 +23333,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "LfE6srhj", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:26.949350+00:00", - "__module__": "datetime" - }, - "trace_id": "04_0VtRzTY-hrOyG", - "type": "metric", - "unit": "tokens", - "value": 40 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "LfE6srhj", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:26.949380+00:00", - "__module__": "datetime" - }, - "trace_id": "04_0VtRzTY-hrOyG", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "LfE6srhj", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:26.949386+00:00", - "__module__": "datetime" - }, - "trace_id": "04_0VtRzTY-hrOyG", - "type": "metric", - "unit": "tokens", - "value": 50 - } - ] + "metrics": null } } ], @@ -22265,59 +23422,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "25plHusk", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:33.915838+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 105 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "25plHusk", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:33.915878+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 22 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "25plHusk", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:33.915886+00:00", - "__module__": "datetime" - }, 
- "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 127 - } - ] + "metrics": null } } ], @@ -22371,7 +23476,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -22391,47 +23496,7 @@ "data": { "event": { "delta": { - "text": "\": \"knowledge_search\", \"parameters\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " {\"query\": \"Perplexity", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " company founding date\"}}", + "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "text" }, "event_type": { @@ -22460,7 +23525,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "4521686e-4866-48a0-b676-30333fee6f3e", + "call_id": "393a2b30-fbe9-44c3-b2b8-4ecdb086785f", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -22501,59 +23566,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "8BkjXIt4", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:33.355430+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 67 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "8BkjXIt4", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:33.355462+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 37 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "8BkjXIt4", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:33.355469+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 104 - } - ] + "metrics": null } } ], @@ -22617,7 +23630,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge", "type": "tool_call" }, "event_type": { @@ -22642,7 +23655,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"knowledge_search\", \"parameters\": {\"", + "tool_call": "_search\", \"parameters\": {\"query\": \"Perplexity", "type": "tool_call" }, "event_type": { @@ -22667,7 +23680,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": 
"in_progress" }, - "tool_call": "query\": \"Perplexity company founding date\"}}", + "tool_call": " company founding date\"}}", "type": "tool_call" }, "event_type": { @@ -22696,7 +23709,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "56701398-4b26-4359-aef2-438255259953", + "call_id": "84505681-7471-4e1d-8779-916703da7dbb", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -22737,59 +23750,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "QTbOWgfM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:26.519884+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 29 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "QTbOWgfM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:26.519949+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "QTbOWgfM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:26.519955+00:00", - "__module__": "datetime" - }, - "trace_id": "CuKMEU31Q26a42-5", - "type": "metric", - "unit": "tokens", - "value": 39 - } - ] + "metrics": null } } ], @@ -22843,7 +23804,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3, 1949, with", + "text": " NBA was created on August 3, 1949, with the", "type": "text" }, "event_type": { @@ -22863,7 +23824,7 @@ "data": { "event": { "delta": { - "text": " the merger of the Basketball Association of America (BAA) and", + "text": " merger of the Basketball Association of America (BAA) and the National", "type": "text" }, "event_type": { @@ -22883,7 +23844,7 @@ "data": { "event": { "delta": { - "text": " the National Basketball League (NBL).", + "text": " Basketball League (NBL).", "type": "text" }, "event_type": { @@ -22918,59 +23879,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "W6iEU_Dm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:37.336705+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 103 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "W6iEU_Dm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:37.336742+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 45 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "W6iEU_Dm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:37.336750+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 148 - } - ] + "metrics": null } } ], @@ 
-23024,7 +23933,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -23044,7 +23953,47 @@ "data": { "event": { "delta": { - "text": " {\"query\": \"when was the nba created\"}}", + "text": " \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " {\"query\": \"when was the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " nba created\"}}", "type": "text" }, "event_type": { @@ -23073,7 +24022,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "82c81003-40bb-4e28-bfb0-9bae122da716", + "call_id": "e8ac462f-e6e7-4ee8-8d18-09e330454890", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23114,59 +24063,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "WX35-rLp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:36.663989+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 65 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "WX35-rLp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:36.664032+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 37 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "WX35-rLp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:36.664039+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 102 - } - ] + "metrics": null } } ], @@ -23230,7 +24127,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "tool_call": "{\"type\": \"function\", \"name", "type": "tool_call" }, "event_type": { @@ -23255,7 +24152,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " {\"query\": \"when was the nba created\"}}", + "tool_call": "\": \"knowledge_search\", \"parameters\": {\"query\": \"when", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, 
+ { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " was the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -23284,7 +24206,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "8fcbc41f-3723-46dd-aee4-948caaa2b458", + "call_id": "db2abfd7-9fe5-4957-b2b4-84b1f120092b", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23325,59 +24247,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "vNEXImhz", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:35.213589+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 27 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "vNEXImhz", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:35.213622+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "vNEXImhz", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:35.213629+00:00", - "__module__": "datetime" - }, - "trace_id": "4Y9e6Ll1RgS_fFdF", - "type": "metric", - "unit": "tokens", - "value": 37 - } - ] + "metrics": null } } ], diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 08d5628ed..3e6b6a307 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -64,6 +64,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -77,6 +90,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = 
df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -115,23 +141,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:1b69d\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. 
When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:5c435\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. 
note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:1b69d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:5c435\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:1b69d\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. 
code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:91d52\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:5c435\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:0cd43\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:91d52\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. 
_glossary_fsdp2:\n\n", "type": "text" }, { @@ -363,11 +389,11 @@ "error_message": null, "metadata": { "document_ids": [ - "42933068-5743-4fe6-983d-3ca299971cba", - "20e5d737-1eef-4529-87bc-9759a59d943e", - "0cd436a4-370e-4962-9313-fde7b2079a10", - "20e5d737-1eef-4529-87bc-9759a59d943e", - "0cd436a4-370e-4962-9313-fde7b2079a10" + "ea3f6e4d-9e11-4bd0-8322-6371f7b0de0c", + "5c435311-5dba-4b40-b8c9-9fd37fbd9b29", + "91d525eb-07dc-4cad-8596-dd0e6bd011f1", + "5c435311-5dba-4b40-b8c9-9fd37fbd9b29", + "91d525eb-07dc-4cad-8596-dd0e6bd011f1" ] } } @@ -379,7 +405,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. 
Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta has donated $1 million to President-elect Donald Trump's inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark\", \"score\": 0.6701125, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"META | Meta Platforms Inc. Company Profile & Executives - WSJ\", \"url\": \"https://www.wsj.com/market-data/quotes/META/company-people\", \"content\": \"Company profile for Meta Platforms Inc. including key executives, insider trading, ownership, revenue and average growth rates. View detailed META description & address.\", \"score\": 0.23361932, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null From bad12ee21fbb53f347d7541f39b78d4b8bc94415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Mar 2025 20:14:04 +0100 Subject: [PATCH 12/58] fix: remove ruff N999 (#1388) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
Since we moved tests/client-sdk to tests/api in https://github.com/meta-llama/llama-stack/pull/1376, the N999 rule is no longer needed. See also https://github.com/meta-llama/llama-stack/commit/abfbaf3c1baa067a7b5feb0866ac8ab565119a3c

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]

[//]: # (## Documentation)

Signed-off-by: Sébastien Han
---
 tests/integration/__init__.py           | 1 -
 tests/integration/agents/__init__.py    | 1 -
 tests/integration/inference/__init__.py | 1 -
 tests/integration/safety/__init__.py    | 1 -
 tests/integration/vector_io/__init__.py | 1 -
 5 files changed, 5 deletions(-)

diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
index ce038c94b..756f351d8 100644
--- a/tests/integration/__init__.py
+++ b/tests/integration/__init__.py
@@ -3,4 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-# ruff: noqa: N999

diff --git a/tests/integration/agents/__init__.py b/tests/integration/agents/__init__.py
index ce038c94b..756f351d8 100644
--- a/tests/integration/agents/__init__.py
+++ b/tests/integration/agents/__init__.py
@@ -3,4 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-# ruff: noqa: N999

diff --git a/tests/integration/inference/__init__.py b/tests/integration/inference/__init__.py
index ce038c94b..756f351d8 100644
--- a/tests/integration/inference/__init__.py
+++ b/tests/integration/inference/__init__.py
@@ -3,4 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-# ruff: noqa: N999

diff --git a/tests/integration/safety/__init__.py b/tests/integration/safety/__init__.py
index ce038c94b..756f351d8 100644
--- a/tests/integration/safety/__init__.py
+++ b/tests/integration/safety/__init__.py
@@ -3,4 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-# ruff: noqa: N999

diff --git a/tests/integration/vector_io/__init__.py b/tests/integration/vector_io/__init__.py
index ce038c94b..756f351d8 100644
--- a/tests/integration/vector_io/__init__.py
+++ b/tests/integration/vector_io/__init__.py
@@ -3,4 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-# ruff: noqa: N999

From 7cf1e24c4e248c8634f32f847a80101d030cb881 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Fri, 7 Mar 2025 20:34:30 +0100
Subject: [PATCH 13/58] feat(logging): implement category-based logging (#1362)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?
This commit introduces a new logging system that allows loggers to be assigned a category while retaining the logger name based on the file name. The log format includes both the logger name and the category, producing output like:

```
INFO 2025-03-03 21:44:11,323 llama_stack.distribution.stack:103 [core]: Tool_groups: builtin::websearch served by tavily-search
```

Key features include:
- Category-based logging: Loggers can be assigned a category (e.g., "core", "server") when they are created. The logger can be loaded like this: `logger = get_logger(name=__name__, category="server")`
- Environment variable control: Log levels can be configured per-category using the `LLAMA_STACK_LOGGING` environment variable. For example: `LLAMA_STACK_LOGGING="server=DEBUG;core=debug"` enables DEBUG level for the "server" and "core" categories.
- `LLAMA_STACK_LOGGING="all=debug"` sets DEBUG level globally for all categories and third-party libraries.
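To make the intended usage concrete, here is a minimal sketch of how a module would adopt the category-based logger. The `get_logger` call and the `LLAMA_STACK_LOGGING` syntax come from the description above; the module path, function name, and messages are made up for illustration:

```python
# Hypothetical module (not part of this patch), e.g.
# llama_stack/distribution/example.py
from llama_stack.log import get_logger

# The logger name is derived from the module; the category groups related loggers.
logger = get_logger(name=__name__, category="core")

def resolve_providers() -> None:
    # Rendered as: INFO <timestamp> llama_stack.distribution.example:<line> [core]: ...
    logger.info("Resolved 2 providers")
    # Emitted only when the "core" category is at DEBUG level, e.g. via
    # LLAMA_STACK_LOGGING="server=DEBUG;core=debug" or LLAMA_STACK_LOGGING="all=debug".
    logger.debug("inference => ollama")
```

Per-category levels can then be set at launch, for example: `LLAMA_STACK_LOGGING="server=DEBUG;core=debug" python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml`.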
This provides fine-grained control over logging levels while maintaining a clean and informative log format. The formatter uses the rich library, which provides nice colors and better stack traces, like so:

```
ERROR 2025-03-03 21:49:37,124 asyncio:1758 [uncategorized]: unhandled exception during asyncio.run() shutdown task: .shutdown() done, defined at /Users/leseb/Documents/AI/llama-stack/llama_stack/distribution/server/server.py:146> exception=UnboundLocalError("local variable 'loop' referenced before assignment")> ╭────────────────────────────────────── Traceback (most recent call last) ───────────────────────────────────────╮ │ /Users/leseb/Documents/AI/llama-stack/llama_stack/distribution/server/server.py:178 in shutdown │ │ │ │ 175 │ │ except asyncio.CancelledError: │ │ 176 │ │ │ pass │ │ 177 │ │ finally: │ │ ❱ 178 │ │ │ loop.stop() │ │ 179 │ │ │ 180 │ loop = asyncio.get_running_loop() │ │ 181 │ loop.create_task(shutdown()) │ ╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ UnboundLocalError: local variable 'loop' referenced before assignment
```

Co-authored-by: Ashwin Bharambe <@ashwinb>
Signed-off-by: Sébastien Han

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

```
python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml
INFO 2025-03-03 21:55:35,918 __main__:365 [server]: Using config file: llama_stack/templates/ollama/run.yaml
INFO 2025-03-03 21:55:35,925 __main__:378 [server]: Run configuration:
INFO 2025-03-03 21:55:35,928 __main__:380 [server]: apis:
- agents
```

[//]: # (## Documentation)

---------

Signed-off-by: Sébastien Han
Co-authored-by: Ashwin Bharambe
---
 llama_stack/cli/stack/run.py                  |   4 +-
 llama_stack/distribution/resolver.py          |  18 +-
 llama_stack/distribution/routers/routers.py   | 106 +++++----
 llama_stack/distribution/server/server.py     |  53 +++--
 llama_stack/distribution/stack.py             |  10 +-
 llama_stack/distribution/start_stack.sh       |   5 +-
 llama_stack/log.py                            | 169 +++++++++++++++
 llama_stack/logcat.py                         | 204 ------------------
 .../agents/meta_reference/agent_instance.py   |  24 +--
 .../remote/inference/fireworks/fireworks.py   |   7 +-
 .../remote/inference/ollama/ollama.py         |  13 +-
 .../remote/inference/together/together.py     |   7 +-
 .../utils/inference/litellm_openai_mixin.py   |   7 +-
 .../utils/inference/prompt_adapter.py         |   7 +-
 pyproject.toml                                |   5 +-
 tests/unit/server/test_logcat.py              |  88 --------
 16 files changed, 296 insertions(+), 431 deletions(-)
 create mode 100644 llama_stack/log.py
 delete mode 100644 llama_stack/logcat.py
 delete mode 100644 tests/unit/server/test_logcat.py

diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py
index ba2273003..e5686fb10 100644
--- a/llama_stack/cli/stack/run.py
+++ b/llama_stack/cli/stack/run.py
@@ -5,15 +5,15 @@
 # the root directory of this source tree.
import argparse -import logging import os from pathlib import Path from llama_stack.cli.subcommand import Subcommand +from llama_stack.log import get_logger REPO_ROOT = Path(__file__).parent.parent.parent.parent -logger = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="server") class StackRun(Subcommand): diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index c24df384d..d7ca4414d 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -7,7 +7,6 @@ import importlib import inspect from typing import Any, Dict, List, Set, Tuple -from llama_stack import logcat from llama_stack.apis.agents import Agents from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO @@ -35,6 +34,7 @@ from llama_stack.distribution.datatypes import ( from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.store import DistributionRegistry from llama_stack.distribution.utils.dynamic import instantiate_class_type +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ( Api, BenchmarksProtocolPrivate, @@ -50,6 +50,8 @@ from llama_stack.providers.datatypes import ( VectorDBsProtocolPrivate, ) +logger = get_logger(name=__name__, category="core") + class InvalidProviderError(Exception): pass @@ -184,7 +186,7 @@ def validate_and_prepare_providers( specs = {} for provider in providers: if not provider.provider_id or provider.provider_id == "__disabled__": - logcat.warning("core", f"Provider `{provider.provider_type}` for API `{api}` is disabled") + logger.warning(f"Provider `{provider.provider_type}` for API `{api}` is disabled") continue validate_provider(provider, api, provider_registry) @@ -206,11 +208,10 @@ def validate_provider(provider: Provider, api: Api, provider_registry: ProviderR p = provider_registry[api][provider.provider_type] if p.deprecation_error: - logcat.error("core", p.deprecation_error) + logger.error(p.deprecation_error) raise InvalidProviderError(p.deprecation_error) elif p.deprecation_warning: - logcat.warning( - "core", + logger.warning( f"Provider `{provider.provider_type}` for API `{api}` is deprecated and will be removed in a future release: {p.deprecation_warning}", ) @@ -244,9 +245,10 @@ def sort_providers_by_deps( ) ) - logcat.debug("core", f"Resolved {len(sorted_providers)} providers") + logger.debug(f"Resolved {len(sorted_providers)} providers") for api_str, provider in sorted_providers: - logcat.debug("core", f" {api_str} => {provider.provider_id}") + logger.debug(f" {api_str} => {provider.provider_id}") + logger.debug("") return sorted_providers @@ -387,7 +389,7 @@ def check_protocol_compliance(obj: Any, protocol: Any) -> None: obj_params = set(obj_sig.parameters) obj_params.discard("self") if not (proto_params <= obj_params): - logcat.error("core", f"Method {name} incompatible proto: {proto_params} vs. obj: {obj_params}") + logger.error(f"Method {name} incompatible proto: {proto_params} vs. 
obj: {obj_params}") missing_methods.append((name, "signature_mismatch")) else: # Check if the method is actually implemented in the class diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index f2c70e66f..28df67922 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -6,7 +6,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional -from llama_stack import logcat from llama_stack.apis.common.content_types import ( URL, InterleavedContent, @@ -52,8 +51,11 @@ from llama_stack.apis.tools import ( ToolRuntime, ) from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO +from llama_stack.log import get_logger from llama_stack.providers.datatypes import RoutingTable +logger = get_logger(name=__name__, category="core") + class VectorIORouter(VectorIO): """Routes to an provider based on the vector db identifier""" @@ -62,15 +64,15 @@ class VectorIORouter(VectorIO): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing VectorIORouter") + logger.debug("Initializing VectorIORouter") self.routing_table = routing_table async def initialize(self) -> None: - logcat.debug("core", "VectorIORouter.initialize") + logger.debug("VectorIORouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "VectorIORouter.shutdown") + logger.debug("VectorIORouter.shutdown") pass async def register_vector_db( @@ -81,10 +83,7 @@ class VectorIORouter(VectorIO): provider_id: Optional[str] = None, provider_vector_db_id: Optional[str] = None, ) -> None: - logcat.debug( - "core", - f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}", - ) + logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}") await self.routing_table.register_vector_db( vector_db_id, embedding_model, @@ -99,8 +98,7 @@ class VectorIORouter(VectorIO): chunks: List[Chunk], ttl_seconds: Optional[int] = None, ) -> None: - logcat.debug( - "core", + logger.debug( f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' 
if len(chunks) > 3 else ''}", ) return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(vector_db_id, chunks, ttl_seconds) @@ -111,7 +109,7 @@ class VectorIORouter(VectorIO): query: InterleavedContent, params: Optional[Dict[str, Any]] = None, ) -> QueryChunksResponse: - logcat.debug("core", f"VectorIORouter.query_chunks: {vector_db_id}") + logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}") return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(vector_db_id, query, params) @@ -122,15 +120,15 @@ class InferenceRouter(Inference): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing InferenceRouter") + logger.debug("Initializing InferenceRouter") self.routing_table = routing_table async def initialize(self) -> None: - logcat.debug("core", "InferenceRouter.initialize") + logger.debug("InferenceRouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "InferenceRouter.shutdown") + logger.debug("InferenceRouter.shutdown") pass async def register_model( @@ -141,8 +139,7 @@ class InferenceRouter(Inference): metadata: Optional[Dict[str, Any]] = None, model_type: Optional[ModelType] = None, ) -> None: - logcat.debug( - "core", + logger.debug( f"InferenceRouter.register_model: {model_id=} {provider_model_id=} {provider_id=} {metadata=} {model_type=}", ) await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type) @@ -160,8 +157,7 @@ class InferenceRouter(Inference): logprobs: Optional[LogProbConfig] = None, tool_config: Optional[ToolConfig] = None, ) -> AsyncGenerator: - logcat.debug( - "core", + logger.debug( f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}", ) if sampling_params is None: @@ -226,8 +222,7 @@ class InferenceRouter(Inference): ) -> AsyncGenerator: if sampling_params is None: sampling_params = SamplingParams() - logcat.debug( - "core", + logger.debug( f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}", ) model = await self.routing_table.get_model(model_id) @@ -257,7 +252,7 @@ class InferenceRouter(Inference): output_dimension: Optional[int] = None, task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: - logcat.debug("core", f"InferenceRouter.embeddings: {model_id}") + logger.debug(f"InferenceRouter.embeddings: {model_id}") model = await self.routing_table.get_model(model_id) if model is None: raise ValueError(f"Model '{model_id}' not found") @@ -277,15 +272,15 @@ class SafetyRouter(Safety): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing SafetyRouter") + logger.debug("Initializing SafetyRouter") self.routing_table = routing_table async def initialize(self) -> None: - logcat.debug("core", "SafetyRouter.initialize") + logger.debug("SafetyRouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "SafetyRouter.shutdown") + logger.debug("SafetyRouter.shutdown") pass async def register_shield( @@ -295,7 +290,7 @@ class SafetyRouter(Safety): provider_id: Optional[str] = None, params: Optional[Dict[str, Any]] = None, ) -> Shield: - logcat.debug("core", f"SafetyRouter.register_shield: {shield_id}") + logger.debug(f"SafetyRouter.register_shield: {shield_id}") return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params) async def run_shield( @@ -304,7 +299,7 @@ class SafetyRouter(Safety): messages: List[Message], 
params: Dict[str, Any] = None, ) -> RunShieldResponse: - logcat.debug("core", f"SafetyRouter.run_shield: {shield_id}") + logger.debug(f"SafetyRouter.run_shield: {shield_id}") return await self.routing_table.get_provider_impl(shield_id).run_shield( shield_id=shield_id, messages=messages, @@ -317,15 +312,15 @@ class DatasetIORouter(DatasetIO): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing DatasetIORouter") + logger.debug("Initializing DatasetIORouter") self.routing_table = routing_table async def initialize(self) -> None: - logcat.debug("core", "DatasetIORouter.initialize") + logger.debug("DatasetIORouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "DatasetIORouter.shutdown") + logger.debug("DatasetIORouter.shutdown") pass async def get_rows_paginated( @@ -335,8 +330,7 @@ class DatasetIORouter(DatasetIO): page_token: Optional[str] = None, filter_condition: Optional[str] = None, ) -> PaginatedRowsResult: - logcat.debug( - "core", + logger.debug( f"DatasetIORouter.get_rows_paginated: {dataset_id}, rows_in_page={rows_in_page}", ) return await self.routing_table.get_provider_impl(dataset_id).get_rows_paginated( @@ -347,7 +341,7 @@ class DatasetIORouter(DatasetIO): ) async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: - logcat.debug("core", f"DatasetIORouter.append_rows: {dataset_id}, {len(rows)} rows") + logger.debug(f"DatasetIORouter.append_rows: {dataset_id}, {len(rows)} rows") return await self.routing_table.get_provider_impl(dataset_id).append_rows( dataset_id=dataset_id, rows=rows, @@ -359,15 +353,15 @@ class ScoringRouter(Scoring): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing ScoringRouter") + logger.debug("Initializing ScoringRouter") self.routing_table = routing_table async def initialize(self) -> None: - logcat.debug("core", "ScoringRouter.initialize") + logger.debug("ScoringRouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "ScoringRouter.shutdown") + logger.debug("ScoringRouter.shutdown") pass async def score_batch( @@ -376,7 +370,7 @@ class ScoringRouter(Scoring): scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, save_results_dataset: bool = False, ) -> ScoreBatchResponse: - logcat.debug("core", f"ScoringRouter.score_batch: {dataset_id}") + logger.debug(f"ScoringRouter.score_batch: {dataset_id}") res = {} for fn_identifier in scoring_functions.keys(): score_response = await self.routing_table.get_provider_impl(fn_identifier).score_batch( @@ -397,10 +391,7 @@ class ScoringRouter(Scoring): input_rows: List[Dict[str, Any]], scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, ) -> ScoreResponse: - logcat.debug( - "core", - f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions", - ) + logger.debug(f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions") res = {} # look up and map each scoring function to its provider impl for fn_identifier in scoring_functions.keys(): @@ -418,15 +409,15 @@ class EvalRouter(Eval): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing EvalRouter") + logger.debug("Initializing EvalRouter") self.routing_table = routing_table async def initialize(self) -> None: - logcat.debug("core", "EvalRouter.initialize") + logger.debug("EvalRouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "EvalRouter.shutdown") + logger.debug("EvalRouter.shutdown") pass async def 
run_eval( @@ -434,7 +425,7 @@ class EvalRouter(Eval): benchmark_id: str, benchmark_config: BenchmarkConfig, ) -> Job: - logcat.debug("core", f"EvalRouter.run_eval: {benchmark_id}") + logger.debug(f"EvalRouter.run_eval: {benchmark_id}") return await self.routing_table.get_provider_impl(benchmark_id).run_eval( benchmark_id=benchmark_id, benchmark_config=benchmark_config, @@ -447,7 +438,7 @@ class EvalRouter(Eval): scoring_functions: List[str], benchmark_config: BenchmarkConfig, ) -> EvaluateResponse: - logcat.debug("core", f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows") + logger.debug(f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows") return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( benchmark_id=benchmark_id, input_rows=input_rows, @@ -460,7 +451,7 @@ class EvalRouter(Eval): benchmark_id: str, job_id: str, ) -> Optional[JobStatus]: - logcat.debug("core", f"EvalRouter.job_status: {benchmark_id}, {job_id}") + logger.debug(f"EvalRouter.job_status: {benchmark_id}, {job_id}") return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) async def job_cancel( @@ -468,7 +459,7 @@ class EvalRouter(Eval): benchmark_id: str, job_id: str, ) -> None: - logcat.debug("core", f"EvalRouter.job_cancel: {benchmark_id}, {job_id}") + logger.debug(f"EvalRouter.job_cancel: {benchmark_id}, {job_id}") await self.routing_table.get_provider_impl(benchmark_id).job_cancel( benchmark_id, job_id, @@ -479,7 +470,7 @@ class EvalRouter(Eval): benchmark_id: str, job_id: str, ) -> EvaluateResponse: - logcat.debug("core", f"EvalRouter.job_result: {benchmark_id}, {job_id}") + logger.debug(f"EvalRouter.job_result: {benchmark_id}, {job_id}") return await self.routing_table.get_provider_impl(benchmark_id).job_result( benchmark_id, job_id, @@ -492,7 +483,7 @@ class ToolRuntimeRouter(ToolRuntime): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing ToolRuntimeRouter.RagToolImpl") + logger.debug("Initializing ToolRuntimeRouter.RagToolImpl") self.routing_table = routing_table async def query( @@ -501,7 +492,7 @@ class ToolRuntimeRouter(ToolRuntime): vector_db_ids: List[str], query_config: Optional[RAGQueryConfig] = None, ) -> RAGQueryResult: - logcat.debug("core", f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}") + logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}") return await self.routing_table.get_provider_impl("knowledge_search").query( content, vector_db_ids, query_config ) @@ -512,9 +503,8 @@ class ToolRuntimeRouter(ToolRuntime): vector_db_id: str, chunk_size_in_tokens: int = 512, ) -> None: - logcat.debug( - "core", - f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}", + logger.debug( + f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}" ) return await self.routing_table.get_provider_impl("insert_into_memory").insert( documents, vector_db_id, chunk_size_in_tokens @@ -524,7 +514,7 @@ class ToolRuntimeRouter(ToolRuntime): self, routing_table: RoutingTable, ) -> None: - logcat.debug("core", "Initializing ToolRuntimeRouter") + logger.debug("Initializing ToolRuntimeRouter") self.routing_table = routing_table # HACK ALERT this should be in sync with "get_all_api_endpoints()" @@ -533,15 +523,15 @@ class ToolRuntimeRouter(ToolRuntime): setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method)) async def initialize(self) -> None: - 
logcat.debug("core", "ToolRuntimeRouter.initialize") + logger.debug("ToolRuntimeRouter.initialize") pass async def shutdown(self) -> None: - logcat.debug("core", "ToolRuntimeRouter.shutdown") + logger.debug("ToolRuntimeRouter.shutdown") pass async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> Any: - logcat.debug("core", f"ToolRuntimeRouter.invoke_tool: {tool_name}") + logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}") return await self.routing_table.get_provider_impl(tool_name).invoke_tool( tool_name=tool_name, kwargs=kwargs, @@ -550,5 +540,5 @@ class ToolRuntimeRouter(ToolRuntime): async def list_runtime_tools( self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None ) -> List[ToolDef]: - logcat.debug("core", f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}") + logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}") return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 2fc36e58f..c4ef79a69 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -9,7 +9,6 @@ import asyncio import functools import inspect import json -import logging import os import signal import sys @@ -28,7 +27,6 @@ from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel, ValidationError from typing_extensions import Annotated -from llama_stack import logcat from llama_stack.distribution.datatypes import StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.request_headers import set_request_provider_data @@ -39,6 +37,7 @@ from llama_stack.distribution.stack import ( replace_env_vars, validate_env_pair, ) +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api from llama_stack.providers.inline.telemetry.meta_reference.config import TelemetryConfig from llama_stack.providers.inline.telemetry.meta_reference.telemetry import ( @@ -54,8 +53,7 @@ from .endpoints import get_all_api_endpoints REPO_ROOT = Path(__file__).parent.parent.parent.parent -logging.basicConfig(level=logging.INFO, format="%(levelname)s %(asctime)s %(name)s:%(lineno)d: %(message)s") -logcat.init() +logger = get_logger(name=__name__, category="server") def warn_with_traceback(message, category, filename, lineno, file=None, line=None): @@ -142,23 +140,23 @@ def handle_signal(app, signum, _) -> None: not block the current execution. """ signame = signal.Signals(signum).name - logcat.info("server", f"Received signal {signame} ({signum}). Exiting gracefully...") + logger.info(f"Received signal {signame} ({signum}). 
Exiting gracefully...") async def shutdown(): try: # Gracefully shut down implementations for impl in app.__llama_stack_impls__.values(): impl_name = impl.__class__.__name__ - logcat.info("server", f"Shutting down {impl_name}") + logger.info("Shutting down %s", impl_name) try: if hasattr(impl, "shutdown"): await asyncio.wait_for(impl.shutdown(), timeout=5) else: - logcat.warning("server", f"No shutdown method for {impl_name}") + logger.warning("No shutdown method for %s", impl_name) except asyncio.TimeoutError: - logcat.exception("server", f"Shutdown timeout for {impl_name}") + logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True) except Exception as e: - logcat.exception("server", f"Failed to shutdown {impl_name}: {e}") + logger.exception("Failed to shutdown %s: %s", impl_name, {e}) # Gather all running tasks loop = asyncio.get_running_loop() @@ -172,7 +170,7 @@ def handle_signal(app, signum, _) -> None: try: await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout=10) except asyncio.TimeoutError: - logcat.exception("server", "Timeout while waiting for tasks to finish") + logger.exception("Timeout while waiting for tasks to finish") except asyncio.CancelledError: pass finally: @@ -184,9 +182,9 @@ def handle_signal(app, signum, _) -> None: @asynccontextmanager async def lifespan(app: FastAPI): - logcat.info("server", "Starting up") + logger.info("Starting up") yield - logcat.info("server", "Shutting down") + logger.info("Shutting down") for impl in app.__llama_stack_impls__.values(): await impl.shutdown() @@ -209,11 +207,11 @@ async def sse_generator(event_gen): yield create_sse_event(item) await asyncio.sleep(0.01) except asyncio.CancelledError: - logcat.info("server", "Generator cancelled") + logger.info("Generator cancelled") await event_gen.aclose() except Exception as e: - logcat.exception("server", f"Error in sse_generator: {e}") - logcat.exception("server", f"Traceback: {''.join(traceback.format_exception(type(e), e, e.__traceback__))}") + logger.exception(f"Error in sse_generator: {e}") + logger.exception(f"Traceback: {''.join(traceback.format_exception(type(e), e, e.__traceback__))}") yield create_sse_event( { "error": { @@ -235,7 +233,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str): value = func(**kwargs) return await maybe_await(value) except Exception as e: - logcat.exception("server", f"Error in {func.__name__}") + traceback.print_exception(e) raise translate_exception(e) from e sig = inspect.signature(func) @@ -314,8 +312,6 @@ class ClientVersionMiddleware: def main(): - logcat.init() - """Start the LlamaStack server.""" parser = argparse.ArgumentParser(description="Start the LlamaStack server.") parser.add_argument( @@ -355,10 +351,10 @@ def main(): for env_pair in args.env: try: key, value = validate_env_pair(env_pair) - logcat.info("server", f"Setting CLI environment variable {key} => {value}") + logger.info(f"Setting CLI environment variable {key} => {value}") os.environ[key] = value except ValueError as e: - logcat.error("server", f"Error: {str(e)}") + logger.error(f"Error: {str(e)}") sys.exit(1) if args.yaml_config: @@ -366,12 +362,12 @@ def main(): config_file = Path(args.yaml_config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") - logcat.info("server", f"Using config file: {config_file}") + logger.info(f"Using config file: {config_file}") elif args.template: config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml" if not 
config_file.exists(): raise ValueError(f"Template {args.template} does not exist") - logcat.info("server", f"Using template {args.template} config file: {config_file}") + logger.info(f"Using template {args.template} config file: {config_file}") else: raise ValueError("Either --yaml-config or --template must be provided") @@ -379,10 +375,9 @@ def main(): config = replace_env_vars(yaml.safe_load(fp)) config = StackRunConfig(**config) - logcat.info("server", "Run configuration:") + logger.info("Run configuration:") safe_config = redact_sensitive_fields(config.model_dump()) - for log_line in yaml.dump(safe_config, indent=2).split("\n"): - logcat.info("server", log_line) + logger.info(yaml.dump(safe_config, indent=2)) app = FastAPI(lifespan=lifespan) app.add_middleware(TracingMiddleware) @@ -392,7 +387,7 @@ def main(): try: impls = asyncio.run(construct_stack(config)) except InvalidProviderError as e: - logcat.error("server", f"Error: {str(e)}") + logger.error(f"Error: {str(e)}") sys.exit(1) if Api.telemetry in impls: @@ -437,7 +432,7 @@ def main(): ) ) - logcat.debug("server", f"serving APIs: {apis_to_serve}") + logger.debug(f"serving APIs: {apis_to_serve}") app.exception_handler(RequestValidationError)(global_exception_handler) app.exception_handler(Exception)(global_exception_handler) @@ -464,10 +459,10 @@ def main(): "ssl_keyfile": keyfile, "ssl_certfile": certfile, } - logcat.info("server", f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}") + logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}") listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0" - logcat.info("server", f"Listening on {listen_host}:{port}") + logger.info(f"Listening on {listen_host}:{port}") uvicorn_config = { "app": app, diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index de74aa858..2b974739a 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -11,9 +11,7 @@ import tempfile from typing import Any, Dict, Optional import yaml -from termcolor import colored -from llama_stack import logcat from llama_stack.apis.agents import Agents from llama_stack.apis.batch_inference import BatchInference from llama_stack.apis.benchmarks import Benchmarks @@ -39,8 +37,11 @@ from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.resolver import ProviderRegistry, resolve_impls from llama_stack.distribution.store.registry import create_dist_registry from llama_stack.distribution.utils.dynamic import instantiate_class_type +from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api +logger = get_logger(name=__name__, category="core") + class LlamaStack( VectorDBs, @@ -101,9 +102,8 @@ async def register_resources(run_config: StackRunConfig, impls: Dict[Api, Any]): objects_to_process = response.data if hasattr(response, "data") else response for obj in objects_to_process: - logcat.debug( - "core", - f"{rsrc.capitalize()}: {colored(obj.identifier, 'white', attrs=['bold'])} served by {colored(obj.provider_id, 'white', attrs=['bold'])}", + logger.debug( + f"{rsrc.capitalize()}: {obj.identifier} served by {obj.provider_id}", ) diff --git a/llama_stack/distribution/start_stack.sh b/llama_stack/distribution/start_stack.sh index a769bd66e..cfc078c27 100755 --- a/llama_stack/distribution/start_stack.sh +++ b/llama_stack/distribution/start_stack.sh @@ -100,12 +100,15 @@ esac if [[ "$env_type" == "venv" || "$env_type" == "conda" ]]; then 
set -x + $PYTHON_BINARY -m llama_stack.distribution.server.server \ --yaml-config "$yaml_config" \ --port "$port" \ $env_vars \ $other_args elif [[ "$env_type" == "container" ]]; then + set -x + # Check if container command is available if ! is_command_available $CONTAINER_BINARY; then printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2 @@ -141,8 +144,6 @@ elif [[ "$env_type" == "container" ]]; then version_tag=$(curl -s $URL | jq -r '.info.version') fi - set -x - $CONTAINER_BINARY run $CONTAINER_OPTS -it \ -p $port:$port \ $env_vars \ diff --git a/llama_stack/log.py b/llama_stack/log.py new file mode 100644 index 000000000..11aa1bf7e --- /dev/null +++ b/llama_stack/log.py @@ -0,0 +1,169 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import logging +import os +from logging.config import dictConfig +from typing import Dict + +from rich.console import Console +from rich.logging import RichHandler + +# Default log level +DEFAULT_LOG_LEVEL = logging.INFO + +# Predefined categories +CATEGORIES = [ + "core", + "server", + "router", + "inference", + "agents", + "safety", + "eval", + "tools", + "client", +] + +# Initialize category levels with default level +_category_levels: Dict[str, int] = {category: DEFAULT_LOG_LEVEL for category in CATEGORIES} + + +def parse_environment_config(env_config: str) -> Dict[str, int]: + """ + Parse the LLAMA_STACK_LOGGING environment variable and return a dictionary of category log levels. + + Parameters: + env_config (str): The value of the LLAMA_STACK_LOGGING environment variable. + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. + """ + category_levels = {} + for pair in env_config.split(";"): + if not pair.strip(): + continue + + try: + category, level = pair.split("=", 1) + category = category.strip().lower() + level = level.strip().upper() # Convert to uppercase for logging._nameToLevel + + level_value = logging._nameToLevel.get(level) + if level_value is None: + logging.warning( + f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'." + ) + continue + + if category == "all": + # Apply the log level to all categories and the root logger + for cat in CATEGORIES: + category_levels[cat] = level_value + # Set the root logger's level to the specified level + category_levels["root"] = level_value + elif category in CATEGORIES: + category_levels[category] = level_value + logging.info(f"Setting '{category}' category to level '{level}'.") + else: + logging.warning(f"Unknown logging category: {category}. No changes made.") + + except ValueError: + logging.warning(f"Invalid logging configuration: '{pair}'. Expected format: 'category=level'.") + + return category_levels + + +class CustomRichHandler(RichHandler): + def __init__(self, *args, **kwargs): + kwargs["console"] = Console(width=120) + super().__init__(*args, **kwargs) + + +def setup_logging(category_levels: Dict[str, int]) -> None: + """ + Configure logging based on the provided category log levels. + + Parameters: + category_levels (Dict[str, int]): A dictionary mapping categories to their log levels. 
+ """ + log_format = "[dim]%(asctime)s %(name)s:%(lineno)d[/] [yellow dim]%(category)s[/]: %(message)s" + + class CategoryFilter(logging.Filter): + """Ensure category is always present in log records.""" + + def filter(self, record): + if not hasattr(record, "category"): + record.category = "uncategorized" # Default to 'uncategorized' if no category found + return True + + # Determine the root logger's level (default to WARNING if not specified) + root_level = category_levels.get("root", logging.WARNING) + + logging_config = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "rich": { + "()": logging.Formatter, + "format": log_format, + } + }, + "handlers": { + "console": { + "()": CustomRichHandler, # Use our custom handler class + "formatter": "rich", + "rich_tracebacks": True, + "show_time": False, + "show_path": False, + "markup": True, + "filters": ["category_filter"], + } + }, + "filters": { + "category_filter": { + "()": CategoryFilter, + } + }, + "loggers": { + category: { + "handlers": ["console"], + "level": category_levels.get(category, DEFAULT_LOG_LEVEL), + "propagate": False, # Disable propagation to root logger + } + for category in CATEGORIES + }, + "root": { + "handlers": ["console"], + "level": root_level, # Set root logger's level dynamically + }, + } + dictConfig(logging_config) + + +def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdapter: + """ + Returns a logger with the specified name and category. + If no category is provided, defaults to 'uncategorized'. + + Parameters: + name (str): The name of the logger (e.g., module or filename). + category (str): The category of the logger (default 'uncategorized'). + + Returns: + logging.LoggerAdapter: Configured logger with category support. + """ + logger = logging.getLogger(name) + logger.setLevel(_category_levels.get(category, DEFAULT_LOG_LEVEL)) + return logging.LoggerAdapter(logger, {"category": category}) + + +env_config = os.environ.get("LLAMA_STACK_LOGGING", "") +if env_config: + print(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}") + _category_levels.update(parse_environment_config(env_config)) + +setup_logging(_category_levels) diff --git a/llama_stack/logcat.py b/llama_stack/logcat.py deleted file mode 100644 index 0e11cb782..000000000 --- a/llama_stack/logcat.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -""" -Category-based logging utility for llama-stack. - -This module provides a wrapper over the standard Python logging module that supports -categorized logging with environment variable control. 
- -Usage: - from llama_stack import logcat - logcat.info("server", "Starting up...") - logcat.debug("inference", "Processing request...") - -Environment variable: - LLAMA_STACK_LOGGING: Semicolon-separated list of category=level pairs - Example: "server=debug;inference=warning" -""" - -import datetime -import logging -import os -from typing import Dict - -# ANSI color codes for terminal output -COLORS = { - "RESET": "\033[0m", - "DEBUG": "\033[36m", # Cyan - "INFO": "\033[32m", # Green - "WARNING": "\033[33m", # Yellow - "ERROR": "\033[31m", # Red - "CRITICAL": "\033[35m", # Magenta - "DIM": "\033[2m", # Dimmed text - "YELLOW_DIM": "\033[2;33m", # Dimmed yellow -} - -# Static list of valid categories representing various parts of the Llama Stack -# server codebase -CATEGORIES = [ - "core", - "server", - "router", - "inference", - "agents", - "safety", - "eval", - "tools", - "client", -] - -_logger = logging.getLogger("llama_stack") -_logger.propagate = False - -_default_level = logging.INFO - -# Category-level mapping (can be modified by environment variables) -_category_levels: Dict[str, int] = {} - - -class TerminalStreamHandler(logging.StreamHandler): - def __init__(self, stream=None): - super().__init__(stream) - self.is_tty = hasattr(self.stream, "isatty") and self.stream.isatty() - - def format(self, record): - record.is_tty = self.is_tty - return super().format(record) - - -class ColoredFormatter(logging.Formatter): - """Custom formatter with colors and fixed-width level names""" - - def format(self, record): - levelname = record.levelname - # Use only time with milliseconds, not date - timestamp = datetime.datetime.now().strftime("%H:%M:%S.%f")[:-3] # HH:MM:SS.mmm format - - file_info = f"{record.filename}:{record.lineno}" - - # Get category from extra if available - category = getattr(record, "category", None) - msg = record.getMessage() - - if getattr(record, "is_tty", False): - color = COLORS.get(levelname, COLORS["RESET"]) - if category: - category_formatted = f"{COLORS['YELLOW_DIM']}{category}{COLORS['RESET']} " - formatted_msg = ( - f"{color}{levelname:<7}{COLORS['RESET']} {COLORS['DIM']}{timestamp}{COLORS['RESET']} " - f"{file_info:<20} {category_formatted}{msg}" - ) - else: - formatted_msg = ( - f"{color}{levelname:<7}{COLORS['RESET']} {COLORS['DIM']}{timestamp}{COLORS['RESET']}] " - f"{file_info:<20} {msg}" - ) - else: - if category: - formatted_msg = f"{levelname:<7} {timestamp} {file_info:<20} [{category}] {msg}" - else: - formatted_msg = f"{levelname:<7} {timestamp} {file_info:<20} {msg}" - - return formatted_msg - - -def init(default_level: int = logging.INFO) -> None: - global _default_level, _category_levels, _logger - - _default_level = default_level - - _logger.setLevel(logging.DEBUG) - _logger.handlers = [] # Clear existing handlers - - # Add our custom handler with the colored formatter - handler = TerminalStreamHandler() - formatter = ColoredFormatter() - handler.setFormatter(formatter) - _logger.addHandler(handler) - - for category in CATEGORIES: - _category_levels[category] = default_level - - env_config = os.environ.get("LLAMA_STACK_LOGGING", "") - if env_config: - for pair in env_config.split(";"): - if not pair.strip(): - continue - - try: - category, level = pair.split("=", 1) - category = category.strip().lower() - level = level.strip().lower() - - level_value = { - "debug": logging.DEBUG, - "info": logging.INFO, - "warning": logging.WARNING, - "warn": logging.WARNING, - "error": logging.ERROR, - "critical": logging.CRITICAL, - }.get(level) - - if 
level_value is None: - _logger.warning(f"Unknown log level '{level}' for category '{category}'") - continue - - if category == "all": - for cat in CATEGORIES: - _category_levels[cat] = level_value - else: - if category in CATEGORIES: - _category_levels[category] = level_value - else: - _logger.warning(f"Unknown logging category: {category}") - - except ValueError: - _logger.warning(f"Invalid logging configuration: {pair}") - - -def _should_log(level: int, category: str) -> bool: - category = category.lower() - if category not in _category_levels: - return False - category_level = _category_levels[category] - return level >= category_level - - -def _log(level: int, level_name: str, category: str, msg: str, *args, **kwargs) -> None: - if _should_log(level, category): - kwargs.setdefault("extra", {})["category"] = category.lower() - getattr(_logger, level_name)(msg, *args, stacklevel=3, **kwargs) - - -def debug(category: str, msg: str, *args, **kwargs) -> None: - _log(logging.DEBUG, "debug", category, msg, *args, **kwargs) - - -def info(category: str, msg: str, *args, **kwargs) -> None: - _log(logging.INFO, "info", category, msg, *args, **kwargs) - - -def warning(category: str, msg: str, *args, **kwargs) -> None: - _log(logging.WARNING, "warning", category, msg, *args, **kwargs) - - -def warn(category: str, msg: str, *args, **kwargs) -> None: - warning(category, msg, *args, **kwargs) - - -def error(category: str, msg: str, *args, **kwargs) -> None: - _log(logging.ERROR, "error", category, msg, *args, **kwargs) - - -def critical(category: str, msg: str, *args, **kwargs) -> None: - _log(logging.CRITICAL, "critical", category, msg, *args, **kwargs) - - -def exception(category: str, msg: str, *args, **kwargs) -> None: - if _should_log(logging.ERROR, category): - kwargs.setdefault("extra", {})["category"] = category.lower() - _logger.exception(msg, *args, stacklevel=2, **kwargs) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 720e73503..3619b3f67 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -17,7 +17,6 @@ from urllib.parse import urlparse import httpx -from llama_stack import logcat from llama_stack.apis.agents import ( AgentConfig, AgentToolGroup, @@ -67,6 +66,7 @@ from llama_stack.apis.tools import ( ToolRuntime, ) from llama_stack.apis.vector_io import VectorIO +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ( BuiltinTool, ToolCall, @@ -88,6 +88,8 @@ MEMORY_QUERY_TOOL = "knowledge_search" WEB_SEARCH_TOOL = "web_search" RAG_TOOL_GROUP = "builtin::rag" +logger = get_logger(name=__name__, category="agents") + class ChatAgent(ShieldRunnerMixin): def __init__( @@ -609,7 +611,7 @@ class ChatAgent(ShieldRunnerMixin): ) if n_iter >= self.agent_config.max_infer_iters: - logcat.info("agents", f"done with MAX iterations ({n_iter}), exiting.") + logger.info(f"done with MAX iterations ({n_iter}), exiting.") # NOTE: mark end_of_turn to indicate to client that we are done with the turn # Do not continue the tool call loop after this point message.stop_reason = StopReason.end_of_turn @@ -617,7 +619,7 @@ class ChatAgent(ShieldRunnerMixin): break if stop_reason == StopReason.out_of_tokens: - logcat.info("agents", "out of token budget, exiting.") + logger.info("out of token budget, exiting.") yield message break @@ -631,16 +633,10 @@ class 
ChatAgent(ShieldRunnerMixin): message.content = [message.content] + output_attachments yield message else: - logcat.debug( - "agents", - f"completion message with EOM (iter: {n_iter}): {str(message)}", - ) + logger.debug(f"completion message with EOM (iter: {n_iter}): {str(message)}") input_messages = input_messages + [message] else: - logcat.debug( - "agents", - f"completion message (iter: {n_iter}) from the model: {str(message)}", - ) + logger.debug(f"completion message (iter: {n_iter}) from the model: {str(message)}") # 1. Start the tool execution step and progress step_id = str(uuid.uuid4()) yield AgentTurnResponseStreamChunk( @@ -983,7 +979,7 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa path = urlparse(uri).path basename = os.path.basename(path) filepath = f"{tempdir}/{make_random_string() + basename}" - logcat.info("agents", f"Downloading {url} -> {filepath}") + logger.info(f"Downloading {url} -> {filepath}") async with httpx.AsyncClient() as client: r = await client.get(uri) @@ -1023,7 +1019,7 @@ async def execute_tool_call_maybe( else: name = name.value - logcat.info("agents", f"executing tool call: {name} with args: {tool_call.arguments}") + logger.info(f"executing tool call: {name} with args: {tool_call.arguments}") result = await tool_runtime_api.invoke_tool( tool_name=name, kwargs={ @@ -1033,7 +1029,7 @@ async def execute_tool_call_maybe( **toolgroup_args.get(group_name, {}), }, ) - logcat.debug("agents", f"tool call {name} completed with result: {result}") + logger.info(f"tool call {name} completed with result: {result}") return result diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index a4cecf9f1..ec68fb556 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -8,7 +8,6 @@ from typing import AsyncGenerator, List, Optional, Union from fireworks.client import Fireworks -from llama_stack import logcat from llama_stack.apis.common.content_types import ( InterleavedContent, InterleavedContentItem, @@ -33,6 +32,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData +from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) @@ -55,6 +55,8 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import FireworksImplConfig from .models import MODEL_ENTRIES +logger = get_logger(name=__name__, category="inference") + class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData): def __init__(self, config: FireworksImplConfig) -> None: @@ -237,7 +239,8 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv "stream": request.stream, **self._build_options(request.sampling_params, request.response_format, request.logprobs), } - logcat.debug("inference", f"params to fireworks: {params}") + logger.debug(f"params to fireworks: {params}") + return params async def embeddings( diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 4d7fef8ed..36941480c 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -4,13 +4,12 @@ # This source code is licensed under the terms described in the 
LICENSE file in # the root directory of this source tree. -import logging + from typing import AsyncGenerator, List, Optional, Union import httpx from ollama import AsyncClient -from llama_stack import logcat from llama_stack.apis.common.content_types import ( ImageContentItem, InterleavedContent, @@ -35,6 +34,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.apis.models import Model, ModelType +from llama_stack.log import get_logger from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, @@ -59,7 +59,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .models import model_entries -log = logging.getLogger(__name__) +logger = get_logger(name=__name__, category="inference") class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): @@ -72,7 +72,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): return AsyncClient(host=self.url) async def initialize(self) -> None: - log.info(f"checking connectivity to Ollama at `{self.url}`...") + logger.info(f"checking connectivity to Ollama at `{self.url}`...") try: await self.client.ps() except httpx.ConnectError as e: @@ -214,7 +214,8 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): "options": sampling_options, "stream": request.stream, } - logcat.debug("inference", f"params to ollama: {params}") + logger.debug(f"params to ollama: {params}") + return params async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse: @@ -290,7 +291,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): async def register_model(self, model: Model) -> Model: model = await self.register_helper.register_model(model) if model.model_type == ModelType.embedding: - log.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...") + logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...") await self.client.pull(model.provider_resource_id) response = await self.client.list() else: diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 0c468cdbf..f701c0da7 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -8,7 +8,6 @@ from typing import AsyncGenerator, List, Optional, Union from together import Together -from llama_stack import logcat from llama_stack.apis.common.content_types import ( InterleavedContent, InterleavedContentItem, @@ -32,6 +31,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData +from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) @@ -54,6 +54,8 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import TogetherImplConfig from .models import MODEL_ENTRIES +logger = get_logger(name=__name__, category="inference") + class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData): def __init__(self, config: TogetherImplConfig) -> None: @@ -224,8 +226,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi "stream": request.stream, **self._build_options(request.sampling_params, request.logprobs, request.response_format), } - logcat.debug("inference", 
f"params to together: {params}") - return params + logger.debug(f"params to together: {params}") async def embeddings( self, diff --git a/llama_stack/providers/utils/inference/litellm_openai_mixin.py b/llama_stack/providers/utils/inference/litellm_openai_mixin.py index 9467996a6..d88dc5a9e 100644 --- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py +++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py @@ -8,7 +8,6 @@ from typing import AsyncGenerator, AsyncIterator, List, Optional, Union import litellm -from llama_stack import logcat from llama_stack.apis.common.content_types import ( InterleavedContent, InterleavedContentItem, @@ -33,6 +32,7 @@ from llama_stack.apis.inference import ( ) from llama_stack.apis.models.models import Model from llama_stack.distribution.request_headers import NeedsRequestProviderData +from llama_stack.log import get_logger from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) @@ -47,6 +47,8 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( interleaved_content_as_str, ) +logger = get_logger(name=__name__, category="inference") + class LiteLLMOpenAIMixin( ModelRegistryHelper, @@ -109,8 +111,7 @@ class LiteLLMOpenAIMixin( ) params = await self._get_params(request) - logcat.debug("inference", f"params to litellm (openai compat): {params}") - + logger.debug(f"params to litellm (openai compat): {params}") # unfortunately, we need to use synchronous litellm.completion here because litellm # caches various httpx.client objects in a non-eventloop aware manner response = litellm.completion(**params) diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py index 37b1a8160..1edf445c0 100644 --- a/llama_stack/providers/utils/inference/prompt_adapter.py +++ b/llama_stack/providers/utils/inference/prompt_adapter.py @@ -8,14 +8,12 @@ import asyncio import base64 import io import json -import logging import re from typing import List, Optional, Tuple, Union import httpx from PIL import Image as PIL_Image -from llama_stack import logcat from llama_stack.apis.common.content_types import ( ImageContentItem, InterleavedContent, @@ -34,6 +32,7 @@ from llama_stack.apis.inference import ( ToolDefinition, UserMessage, ) +from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ( ModelFamily, RawContent, @@ -58,7 +57,7 @@ from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.models.llama.sku_list import resolve_model from llama_stack.providers.utils.inference import supported_inference_models -log = logging.getLogger(__name__) +log = get_logger(name=__name__, category="inference") class ChatCompletionRequestWithRawContent(ChatCompletionRequest): @@ -464,7 +463,7 @@ def _get_tool_choice_prompt(tool_choice: ToolChoice | str, tools: List[ToolDefin def get_default_tool_prompt_format(model: str) -> ToolPromptFormat: llama_model = resolve_model(model) if llama_model is None: - logcat.warning("inference", f"Could not resolve model {model}, defaulting to json tool prompt format") + log.warning(f"Could not resolve model {model}, defaulting to json tool prompt format") return ToolPromptFormat.json if llama_model.model_family == ModelFamily.llama3_1 or ( diff --git a/pyproject.toml b/pyproject.toml index d8f3718d8..0fa055a02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -151,7 +151,6 @@ exclude = [ "llama_stack/distribution", "llama_stack/apis", "llama_stack/cli", - 
"llama_stack/logcat.py", "llama_stack/models", "llama_stack/strong_typing", "llama_stack/templates", @@ -163,5 +162,5 @@ module = ["yaml", "fire"] ignore_missing_imports = true [[tool.mypy.overrides]] -module = "llama_stack.distribution.resolver" -follow_imports = "normal" # This will force type checking on this module +module = ["llama_stack.distribution.resolver", "llama_stack.log"] +follow_imports = "normal" # This will force type checking on this module diff --git a/tests/unit/server/test_logcat.py b/tests/unit/server/test_logcat.py deleted file mode 100644 index 4a116a08f..000000000 --- a/tests/unit/server/test_logcat.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import io -import logging -import os -import unittest - -from llama_stack import logcat - - -class TestLogcat(unittest.TestCase): - def setUp(self): - self.original_env = os.environ.get("LLAMA_STACK_LOGGING") - - self.log_output = io.StringIO() - self._init_logcat() - - def tearDown(self): - if self.original_env is not None: - os.environ["LLAMA_STACK_LOGGING"] = self.original_env - else: - os.environ.pop("LLAMA_STACK_LOGGING", None) - - def _init_logcat(self): - logcat.init(default_level=logging.DEBUG) - self.handler = logging.StreamHandler(self.log_output) - self.handler.setFormatter(logging.Formatter("[%(category)s] %(message)s")) - logcat._logger.handlers.clear() - logcat._logger.addHandler(self.handler) - - def test_basic_logging(self): - logcat.info("server", "Info message") - logcat.warning("server", "Warning message") - logcat.error("server", "Error message") - - output = self.log_output.getvalue() - self.assertIn("[server] Info message", output) - self.assertIn("[server] Warning message", output) - self.assertIn("[server] Error message", output) - - def test_different_categories(self): - # Log messages with different categories - logcat.info("server", "Server message") - logcat.info("inference", "Inference message") - logcat.info("router", "Router message") - - output = self.log_output.getvalue() - self.assertIn("[server] Server message", output) - self.assertIn("[inference] Inference message", output) - self.assertIn("[router] Router message", output) - - def test_env_var_control(self): - os.environ["LLAMA_STACK_LOGGING"] = "server=debug;inference=warning" - self._init_logcat() - - # These should be visible based on the environment settings - logcat.debug("server", "Server debug message") - logcat.info("server", "Server info message") - logcat.warning("inference", "Inference warning message") - logcat.error("inference", "Inference error message") - - # These should be filtered out based on the environment settings - logcat.debug("inference", "Inference debug message") - logcat.info("inference", "Inference info message") - - output = self.log_output.getvalue() - self.assertIn("[server] Server debug message", output) - self.assertIn("[server] Server info message", output) - self.assertIn("[inference] Inference warning message", output) - self.assertIn("[inference] Inference error message", output) - - self.assertNotIn("[inference] Inference debug message", output) - self.assertNotIn("[inference] Inference info message", output) - - def test_invalid_category(self): - logcat.info("nonexistent", "This message should not be logged") - - # Check that the message was not logged - output = self.log_output.getvalue() - 
self.assertNotIn("[nonexistent] This message should not be logged", output) - - -if __name__ == "__main__": - unittest.main() From ffa32af930af794ff3244a9736a29687800704c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Mar 2025 20:42:38 +0100 Subject: [PATCH 14/58] build: bump llama-stack-client version (#1469) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What does this PR do? Use 0.1.5. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: Sébastien Han --- pyproject.toml | 2 +- requirements.txt | 2 +- uv.lock | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0fa055a02..5519727bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ dependencies = [ "httpx", "huggingface-hub", "jsonschema", - "llama-stack-client>=0.1.4", + "llama-stack-client>=0.1.5", "prompt-toolkit", "python-dotenv", "pydantic>=2", diff --git a/requirements.txt b/requirements.txt index 90f329d4d..1945b08a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,7 @@ huggingface-hub==0.29.0 idna==3.10 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -llama-stack-client==0.1.4 +llama-stack-client==0.1.5 lxml==5.3.1 markdown-it-py==3.0.0 mdurl==0.1.2 diff --git a/uv.lock b/uv.lock index e62d9426e..4a1eca676 100644 --- a/uv.lock +++ b/uv.lock @@ -945,7 +945,7 @@ requires-dist = [ { name = "huggingface-hub" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, - { name = "llama-stack-client", specifier = ">=0.1.4" }, + { name = "llama-stack-client", specifier = ">=0.1.5" }, { name = "lm-format-enforcer", marker = "extra == 'test'", specifier = ">=0.10.9" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, @@ -990,7 +990,7 @@ provides-extras = ["dev", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" -version = "0.1.4" +version = "0.1.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1007,9 +1007,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/71/6b/0c9900bcefe683b1186c272f372ac643ebd307db9efa95fa2c4418e207b3/llama_stack_client-0.1.4.tar.gz", hash = "sha256:539ff9b8c40272d4f3b023605aff9b70e66958b6bd952a04f9e9a5b2bfde00dd", size = 260958 } +sdist = { url = "https://files.pythonhosted.org/packages/72/26/24b8dcd97dadee66cf0b9a3cb0ee18c65a92b8732de76c1aec97d85306e2/llama_stack_client-0.1.5.tar.gz", hash = "sha256:f342969920c87d9518298fade6debecb15b7c19899eed241d61253be2bf35053", size = 261420 } wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/00/56d7699354677e584610d5457baf09b0fde7ca71946532ba0f867d5e47c2/llama_stack_client-0.1.4-py3-none-any.whl", hash = "sha256:5034e7b3aac099a3ad88868b3ba1d2ba19285151ec40776ceda18e500b866a8e", size = 369327 }, + { url = "https://files.pythonhosted.org/packages/ed/07/329a5220325a3a352967717e8878db1edc9c88616e36e0a1e819571067c0/llama_stack_client-0.1.5-py3-none-any.whl", hash = "sha256:2aeff88b6f836d71fd2c75d087ccc19d881fca769e05636b0ddf7b41a7c4aef8", size = 369754 }, ] [[package]] From 256448c14efbfe7922723ec784710bcb9cc19ad8 
Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 11:45:54 -0800 Subject: [PATCH 15/58] fix(cli): llama model prompt-format (#1481) Summary: + llama model prompt-format -m Llama3.2-11B-Vision-Instruct Traceback (most recent call last): File "/tmp/tmp.gCwyyCcjoA/.venv/bin/llama", line 10, in sys.exit(main()) File "/tmp/tmp.gCwyyCcjoA/.venv/lib/python3.10/site-packages/llama_stack/cli/llama.py", line 50, in main parser.run(args) File "/tmp/tmp.gCwyyCcjoA/.venv/lib/python3.10/site-packages/llama_stack/cli/llama.py", line 44, in run args.func(args) File "/tmp/tmp.gCwyyCcjoA/.venv/lib/python3.10/site-packages/llama_stack/cli/model/prompt_format.py", line 59, in _run_model_template_cmd if args.list: AttributeError: 'Namespace' object has no attribute 'list' Test Plan: llama model prompt-format -m Llama3.2-11B-Vision-Instruct --- llama_stack/cli/model/prompt_format.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 0cee94235..8058db461 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -13,7 +13,7 @@ from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.table import print_table from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family -ROOT_DIR = Path(__file__).parent.parent +ROOT_DIR = Path(__file__).parent.parent.parent class ModelPromptFormat(Subcommand): @@ -44,6 +44,12 @@ class ModelPromptFormat(Subcommand): default="llama3_1", help="Model Family (llama3_1, llama3_X, etc.)", ) + self.parser.add_argument( + "-l", + "--list", + action="store_true", + help="List all available models", + ) def _run_model_template_cmd(self, args: argparse.Namespace) -> None: import importlib.resources From d86a893ead18e738eb29aee2719f2ae051a747f7 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 7 Mar 2025 14:48:00 -0500 Subject: [PATCH 16/58] fix: Swap to AsyncOpenAI client in remote vllm provider (#1459) # What does this PR do? This switches from an OpenAI client to the AsyncOpenAI client in the remote vllm provider. The main benefit of this is that instead of each client call being a blocking operation that was blocking our server event loop, the client calls are now async operations that do not block the event loop. The actual fix is quite simple and straightforward. Creating a reliable reproducer of this with a unit test that verifies we were blocking the event loop before and are not blocking it any longer was a bit harder. Some other inference providers have this same issue, so we may want to make that simple delayed http server a bit more generic and pull it into a common place as other inference providers get fixed. 
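As a rough sketch of the pattern (hypothetical names; the adapter's actual changes are in the diff below), the difference is whether the HTTP round-trip is awaited:

```python
# Minimal sketch: an awaited AsyncOpenAI call yields control back to the
# event loop while the request is in flight, instead of holding the loop
# the way a synchronous OpenAI(...).chat.completions.create(...) call does.
from openai import AsyncOpenAI


async def chat_once(base_url: str, api_key: str) -> str:
    client = AsyncOpenAI(base_url=base_url, api_key=api_key)
    response = await client.chat.completions.create(
        model="placeholder-model",  # hypothetical model id
        messages=[{"role": "user", "content": "ping"}],
    )
    return response.choices[0].message.content
```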
(Closes #1457) ## Test Plan I verified the unit tests and test_text_inference tests pass with this change like below: ``` python -m pytest -v tests/unit ``` ``` VLLM_URL="http://localhost:8000/v1" \ INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \ LLAMA_STACK_CONFIG=remote-vllm \ python -m pytest -v -s \ tests/integration/inference/test_text_inference.py \ --text-model "meta-llama/Llama-3.2-3B-Instruct" ``` Signed-off-by: Ben Browning --- .../providers/remote/inference/vllm/vllm.py | 35 ++---- .../providers/inference/test_remote_vllm.py | 101 +++++++++++++++++- 2 files changed, 107 insertions(+), 29 deletions(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index ac9a46e85..4d7e66d78 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -7,7 +7,7 @@ import json import logging from typing import AsyncGenerator, List, Optional, Union -from openai import OpenAI +from openai import AsyncOpenAI from openai.types.chat.chat_completion_chunk import ( ChatCompletionChunk as OpenAIChatCompletionChunk, ) @@ -229,7 +229,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): async def initialize(self) -> None: log.info(f"Initializing VLLM client with base_url={self.config.url}") - self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token) + self.client = AsyncOpenAI(base_url=self.config.url, api_key=self.config.api_token) async def shutdown(self) -> None: pass @@ -300,10 +300,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): return await self._nonstream_chat_completion(request, self.client) async def _nonstream_chat_completion( - self, request: ChatCompletionRequest, client: OpenAI + self, request: ChatCompletionRequest, client: AsyncOpenAI ) -> ChatCompletionResponse: params = await self._get_params(request) - r = client.chat.completions.create(**params) + r = await client.chat.completions.create(**params) choice = r.choices[0] result = ChatCompletionResponse( completion_message=CompletionMessage( @@ -315,17 +315,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): ) return result - async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator: + async def _stream_chat_completion(self, request: ChatCompletionRequest, client: AsyncOpenAI) -> AsyncGenerator: params = await self._get_params(request) - # TODO: Can we use client.completions.acreate() or maybe there is another way to directly create an async - # generator so this wrapper is not necessary? 
- async def _to_async_generator(): - s = client.chat.completions.create(**params) - for chunk in s: - yield chunk - - stream = _to_async_generator() + stream = await client.chat.completions.create(**params) if len(request.tools) > 0: res = _process_vllm_chat_completion_stream_response(stream) else: @@ -335,26 +328,20 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse: params = await self._get_params(request) - r = self.client.completions.create(**params) + r = await self.client.completions.create(**params) return process_completion_response(r) async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator: params = await self._get_params(request) - # Wrapper for async generator similar - async def _to_async_generator(): - stream = self.client.completions.create(**params) - for chunk in stream: - yield chunk - - stream = _to_async_generator() + stream = await self.client.completions.create(**params) async for chunk in process_completion_stream_response(stream): yield chunk async def register_model(self, model: Model) -> Model: model = await self.register_helper.register_model(model) - res = self.client.models.list() - available_models = [m.id for m in res] + res = await self.client.models.list() + available_models = [m.id async for m in res] if model.provider_resource_id not in available_models: raise ValueError( f"Model {model.provider_resource_id} is not being served by vLLM. " @@ -410,7 +397,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): assert model.metadata.get("embedding_dimension") kwargs["dimensions"] = model.metadata.get("embedding_dimension") assert all(not content_has_media(content) for content in contents), "VLLM does not support media for embeddings" - response = self.client.embeddings.create( + response = await self.client.embeddings.create( model=model.provider_resource_id, input=[interleaved_content_as_str(content) for content in contents], **kwargs, diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 11b1ba123..3afe1389e 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -4,6 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import asyncio
+import json
+import logging
+import threading
+import time
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict
 from unittest.mock import AsyncMock, patch
 
 import pytest
@@ -39,9 +46,41 @@ from llama_stack.providers.remote.inference.vllm.vllm import (
 # -v -s --tb=short --disable-warnings
 
 
+class MockInferenceAdapterWithSleep:
+    def __init__(self, sleep_time: int, response: Dict[str, Any]):
+        self.httpd = None
+
+        class DelayedRequestHandler(BaseHTTPRequestHandler):
+            # ruff: noqa: N802
+            def do_POST(self):
+                time.sleep(sleep_time)
+                self.send_response(code=200)
+                self.end_headers()
+                self.wfile.write(json.dumps(response).encode("utf-8"))
+
+        self.request_handler = DelayedRequestHandler
+
+    def __enter__(self):
+        httpd = HTTPServer(("", 0), self.request_handler)
+        self.httpd = httpd
+        host, port = httpd.server_address
+        httpd_thread = threading.Thread(target=httpd.serve_forever)
+        httpd_thread.daemon = True  # stop server if this thread terminates
+        httpd_thread.start()
+
+        config = VLLMInferenceAdapterConfig(url=f"http://{host}:{port}")
+        inference_adapter = VLLMInferenceAdapter(config)
+        return inference_adapter
+
+    def __exit__(self, _exc_type, _exc_value, _traceback):
+        if self.httpd:
+            self.httpd.shutdown()
+            self.httpd.server_close()
+
+
 @pytest.fixture(scope="module")
 def mock_openai_models_list():
-    with patch("openai.resources.models.Models.list") as mock_list:
+    with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list:
         yield mock_list
 
 
@@ -56,10 +95,10 @@ async def vllm_inference_adapter():
 
 @pytest.mark.asyncio
 async def test_register_model_checks_vllm(mock_openai_models_list, vllm_inference_adapter):
-    mock_openai_models = [
-        OpenAIModel(id="foo", created=1, object="model", owned_by="test"),
-    ]
-    mock_openai_models_list.return_value = mock_openai_models
+    async def mock_openai_models():
+        yield OpenAIModel(id="foo", created=1, object="model", owned_by="test")
+
+    mock_openai_models_list.return_value = mock_openai_models()
 
     foo_model = Model(identifier="foo", provider_resource_id="foo", provider_id="vllm-inference")
 
@@ -141,3 +180,55 @@ async def test_process_vllm_chat_completion_stream_response_no_choices():
     chunks = [chunk async for chunk in _process_vllm_chat_completion_stream_response(mock_stream())]
     assert len(chunks) == 0
+
+
+def test_chat_completion_doesnt_block_event_loop(caplog):
+    loop = asyncio.new_event_loop()
+    loop.set_debug(True)
+    caplog.set_level(logging.WARNING)
+
+    # Log when event loop is blocked for more than 100ms
+    loop.slow_callback_duration = 0.1
+    # Sleep for 500ms in our delayed http response
+    sleep_time = 0.5
+
+    mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm-inference")
+    mock_response = {
+        "id": "chatcmpl-abc123",
+        "object": "chat.completion",
+        "created": 1,
+        "model": "mock-model",
+        "choices": [
+            {
+                "message": {"content": ""},
+                "logprobs": None,
+                "finish_reason": "stop",
+                "index": 0,
+            }
+        ],
+    }
+
+    async def do_chat_completion():
+        await inference_adapter.chat_completion(
+            "mock-model",
+            [],
+            stream=False,
+            tools=None,
+            tool_config=ToolConfig(tool_choice=ToolChoice.auto),
+        )
+
+    with MockInferenceAdapterWithSleep(sleep_time, mock_response) as inference_adapter:
+        inference_adapter.model_store = AsyncMock()
+        inference_adapter.model_store.get_model.return_value = mock_model
+        loop.run_until_complete(inference_adapter.initialize())
+
+        # Clear the logs so far and run the actual chat completion we 
care about + caplog.clear() + loop.run_until_complete(do_chat_completion()) + + # Ensure we don't have any asyncio warnings in the captured log + # records from our chat completion call. A message gets logged + # here any time we exceed the slow_callback_duration configured + # above. + asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"] + assert not asyncio_warnings From 124e8d7cfe244978e2eac6de192f2134046abb39 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 12:10:52 -0800 Subject: [PATCH 17/58] build: include .md (#1482) Summary: Test Plan: --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index b47c2dccb..572a9ac0a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,3 +5,4 @@ include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh include llama_stack/templates/*/*.yaml include llama_stack/providers/tests/test_cases/inference/*.json +include llama_stack/models/llama/*/*.md From 125728836190cb960059177db83a6ca413ecdcbe Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 12:36:02 -0800 Subject: [PATCH 18/58] build: add 'tiktoken' to deps (#1483) Summary: Test Plan: --- .../templates/open_benchmark/build.yaml | 37 ++ llama_stack/templates/open_benchmark/run.yaml | 364 ++++++++++++++++++ pyproject.toml | 3 +- requirements.txt | 3 + uv.lock | 6 +- 5 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 llama_stack/templates/open_benchmark/build.yaml create mode 100644 llama_stack/templates/open_benchmark/run.yaml diff --git a/llama_stack/templates/open_benchmark/build.yaml b/llama_stack/templates/open_benchmark/build.yaml new file mode 100644 index 000000000..367dd1374 --- /dev/null +++ b/llama_stack/templates/open_benchmark/build.yaml @@ -0,0 +1,37 @@ +version: '2' +distribution_spec: + description: Distribution for running open benchmarks + providers: + inference: + - remote::openai + - remote::anthropic + - remote::gemini + - remote::groq + - remote::together + - inline::sentence-transformers + vector_io: + - inline::sqlite-vec + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust + tool_runtime: + - remote::brave-search + - remote::tavily-search + - inline::code-interpreter + - inline::rag-runtime + - remote::model-context-protocol +image_type: conda diff --git a/llama_stack/templates/open_benchmark/run.yaml b/llama_stack/templates/open_benchmark/run.yaml new file mode 100644 index 000000000..e98c2c708 --- /dev/null +++ b/llama_stack/templates/open_benchmark/run.yaml @@ -0,0 +1,364 @@ +version: '2' +image_name: open_benchmark +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY:} + - provider_id: anthropic + provider_type: remote::anthropic + config: + api_key: ${env.ANTHROPIC_API_KEY:} + - provider_id: gemini + provider_type: remote::gemini + config: + api_key: ${env.GEMINI_API_KEY:} + - provider_id: groq + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY:} + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + 
api_key: ${env.TOGETHER_API_KEY} + vector_io: + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open_benchmark}/sqlite_vec.db + - provider_id: ${env.ENABLE_CHROMADB+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:} + - provider_id: ${env.ENABLE_PGVECTOR+pgvector} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:localhost} + port: ${env.PGVECTOR_PORT:5432} + db: ${env.PGVECTOR_DB:} + user: ${env.PGVECTOR_USER:} + password: ${env.PGVECTOR_PASSWORD:} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open_benchmark}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open_benchmark/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open_benchmark}/registry.db +models: +- metadata: {} + model_id: openai/gpt-4o + provider_id: openai + provider_model_id: openai/gpt-4o + model_type: llm +- metadata: {} + model_id: openai/gpt-4o-mini + provider_id: openai + provider_model_id: openai/gpt-4o-mini + model_type: llm +- metadata: {} + model_id: openai/chatgpt-4o-latest + provider_id: openai + provider_model_id: openai/chatgpt-4o-latest + model_type: llm +- metadata: + embedding_dimension: 1536 + context_length: 8192 + model_id: openai/text-embedding-3-small + provider_id: openai + provider_model_id: openai/text-embedding-3-small + model_type: embedding +- metadata: + embedding_dimension: 3072 + context_length: 8192 + model_id: openai/text-embedding-3-large + provider_id: openai + provider_model_id: openai/text-embedding-3-large + model_type: embedding +- metadata: {} + model_id: anthropic/claude-3-5-sonnet-latest + provider_id: anthropic + provider_model_id: anthropic/claude-3-5-sonnet-latest + model_type: llm +- metadata: {} + model_id: anthropic/claude-3-7-sonnet-latest + provider_id: anthropic + provider_model_id: anthropic/claude-3-7-sonnet-latest + model_type: llm +- metadata: {} + model_id: 
anthropic/claude-3-5-haiku-latest + provider_id: anthropic + provider_model_id: anthropic/claude-3-5-haiku-latest + model_type: llm +- metadata: + embedding_dimension: 1024 + context_length: 32000 + model_id: anthropic/voyage-3 + provider_id: anthropic + provider_model_id: anthropic/voyage-3 + model_type: embedding +- metadata: + embedding_dimension: 512 + context_length: 32000 + model_id: anthropic/voyage-3-lite + provider_id: anthropic + provider_model_id: anthropic/voyage-3-lite + model_type: embedding +- metadata: + embedding_dimension: 1024 + context_length: 32000 + model_id: anthropic/voyage-code-3 + provider_id: anthropic + provider_model_id: anthropic/voyage-code-3 + model_type: embedding +- metadata: {} + model_id: gemini/gemini-1.5-flash + provider_id: gemini + provider_model_id: gemini/gemini-1.5-flash + model_type: llm +- metadata: {} + model_id: gemini/gemini-1.5-pro + provider_id: gemini + provider_model_id: gemini/gemini-1.5-pro + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 2048 + model_id: gemini/text-embedding-004 + provider_id: gemini + provider_model_id: gemini/text-embedding-004 + model_type: embedding +- metadata: {} + model_id: groq/llama3-8b-8192 + provider_id: groq + provider_model_id: groq/llama3-8b-8192 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: groq + provider_model_id: groq/llama3-8b-8192 + model_type: llm +- metadata: {} + model_id: groq/llama-3.1-8b-instant + provider_id: groq + provider_model_id: groq/llama-3.1-8b-instant + model_type: llm +- metadata: {} + model_id: groq/llama3-70b-8192 + provider_id: groq + provider_model_id: groq/llama3-70b-8192 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3-70B-Instruct + provider_id: groq + provider_model_id: groq/llama3-70b-8192 + model_type: llm +- metadata: {} + model_id: groq/llama-3.3-70b-versatile + provider_id: groq + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: groq + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: groq/llama-3.2-3b-preview + provider_id: groq + provider_model_id: groq/llama-3.2-3b-preview + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: groq + provider_model_id: groq/llama-3.2-3b-preview + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-70B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + 
provider_id: together + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-Guard-3-8B + provider_id: together + provider_model_id: meta-llama/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-8B + provider_id: together + provider_model_id: meta-llama/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + provider_id: together + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: together + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: togethercomputer/m2-bert-80M-8k-retrieval + provider_id: together + provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval + model_type: embedding +- metadata: + embedding_dimension: 768 + context_length: 32768 + model_id: togethercomputer/m2-bert-80M-32k-retrieval + provider_id: together + provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval + model_type: embedding +shields: +- shield_id: meta-llama/Llama-Guard-3-8B +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/pyproject.toml b/pyproject.toml index 5519727bc..fb3065ced 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ dependencies = [ "rich", "setuptools", "termcolor", + "tiktoken", + "pillow", ] [project.optional-dependencies] @@ -63,7 +65,6 @@ test = [ "groq", "opentelemetry-sdk", "opentelemetry-exporter-otlp-proto-http", - "tiktoken", "chardet", "pypdf", ] diff --git a/requirements.txt b/requirements.txt index 1945b08a6..d2e2e7a29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ mdurl==0.1.2 numpy==2.2.3 packaging==24.2 pandas==2.2.3 +pillow==11.1.0 prompt-toolkit==3.0.50 pyaml==25.1.0 pycryptodomex==3.21.0 @@ -38,6 +39,7 @@ python-dotenv==1.0.1 pytz==2025.1 pyyaml==6.0.2 referencing==0.36.2 +regex==2024.11.6 
requests==2.32.3 rich==13.9.4 rpds-py==0.22.3 @@ -45,6 +47,7 @@ setuptools==75.8.0 six==1.17.0 sniffio==1.3.1 termcolor==2.5.0 +tiktoken==0.9.0 tqdm==4.67.1 typing-extensions==4.12.2 tzdata==2025.1 diff --git a/uv.lock b/uv.lock index 4a1eca676..09ad0815e 100644 --- a/uv.lock +++ b/uv.lock @@ -871,6 +871,7 @@ dependencies = [ { name = "huggingface-hub" }, { name = "jsonschema" }, { name = "llama-stack-client" }, + { name = "pillow" }, { name = "prompt-toolkit" }, { name = "pydantic" }, { name = "python-dotenv" }, @@ -878,6 +879,7 @@ dependencies = [ { name = "rich" }, { name = "setuptools" }, { name = "termcolor" }, + { name = "tiktoken" }, ] [package.optional-dependencies] @@ -924,7 +926,6 @@ test = [ { name = "opentelemetry-sdk" }, { name = "pypdf" }, { name = "sqlite-vec" }, - { name = "tiktoken" }, { name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" }, { name = "torchvision", version = "0.21.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, @@ -953,6 +954,7 @@ requires-dist = [ { name = "openai", marker = "extra == 'test'" }, { name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" }, { name = "opentelemetry-sdk", marker = "extra == 'test'" }, + { name = "pillow" }, { name = "pre-commit", marker = "extra == 'dev'" }, { name = "prompt-toolkit" }, { name = "pydantic", specifier = ">=2" }, @@ -978,7 +980,7 @@ requires-dist = [ { name = "sphinxcontrib-video", marker = "extra == 'docs'" }, { name = "sqlite-vec", marker = "extra == 'test'" }, { name = "termcolor" }, - { name = "tiktoken", marker = "extra == 'test'" }, + { name = "tiktoken" }, { name = "tomli", marker = "extra == 'docs'" }, { name = "torch", marker = "extra == 'test'", specifier = ">=2.6.0", index = "https://download.pytorch.org/whl/cpu" }, { name = "torchvision", marker = "extra == 'test'", specifier = ">=0.21.0", index = "https://download.pytorch.org/whl/cpu" }, From 5a2b9e121c2ea014c8c987f78cf83967a1b1c8f8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 7 Mar 2025 12:52:26 -0800 Subject: [PATCH 19/58] fix: return result for together's get_params (#1484) # What does this PR do? 
- return results for together's get_params
- fix issue: the `return params` was accidentally deleted in
https://github.com/meta-llama/llama-stack/pull/1362/files#diff-d9345410ea64589cee96487b22eab0d45f7497a80c25dca295cecd254decb204

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

```
LLAMA_STACK_CONFIG=together pytest -v tests/integration/inference
```

[//]: # (## Documentation)
---
 llama_stack/providers/remote/inference/together/together.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py
index f701c0da7..2046d4aae 100644
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -32,9 +32,7 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
-from llama_stack.providers.utils.inference.model_registry import (
-    ModelRegistryHelper,
-)
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
     convert_message_to_openai_dict,
     get_sampling_options,
@@ -227,6 +225,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
             **self._build_options(request.sampling_params, request.logprobs, request.response_format),
         }
         logger.debug(f"params to together: {params}")
+        return params

     async def embeddings(
         self,

From e6355bfc3ba9dd215dec454f8db2550b3ae8b308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Fri, 7 Mar 2025 21:54:56 +0100
Subject: [PATCH 20/58] ci: enable Dependabot for GitHub Actions (#1470)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

Add a Dependabot configuration file (.github/dependabot.yml) to enable
automated dependency updates for GitHub Actions. This ensures workflows
stay up to date with the latest versions, improving security and
reliability.

Dependabot is configured to:
- Monitor GitHub Actions dependencies.
- Check for updates in the workflow directory.
- Run updates on a daily schedule.

Signed-off-by: Sébastien Han
---
 .github/dependabot.yml | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 .github/dependabot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..4aba604dd
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,8 @@
+# GitHub Dependabot configuration
+version: 2
+updates:
+  # Enable version updates for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/" # Will use the default workflow location of `.github/workflows`
+    schedule:
+      interval: "daily"

From a55aab595872e69ff33f6defcbbfe969ed00ed98 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Fri, 7 Mar 2025 13:13:41 -0800
Subject: [PATCH 21/58] fix: fix scoring tests (#1487)

# What does this PR do?
- fix scoring test [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/scoring/test_scoring.py --text-model meta-llama/Llama-3.3-70B-Instruct --judge-model meta-llama/Llama-3.3-70B-Instruct ``` image [//]: # (## Documentation) --- tests/integration/scoring/test_scoring.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index ecf3b9425..2fcdf54e2 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -81,8 +81,6 @@ def test_scoring_functions_register( def test_scoring_score(llama_stack_client): register_dataset(llama_stack_client, for_rag=True) - response = llama_stack_client.datasets.list() - assert len(response) == 1 # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( @@ -119,8 +117,6 @@ def test_scoring_score(llama_stack_client): def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id): register_dataset(llama_stack_client, for_rag=True) - response = llama_stack_client.datasets.list() - assert len(response) == 1 # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( From acbae66b9d796fa7fc85ad2e115bde84ed70065f Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 13:33:45 -0800 Subject: [PATCH 22/58] chore: escape tool output for logging (#1490) Summary: error: llama_stack/providers/inline/agents/meta_reference/agent_instance.py:1032: in execute_tool_call_maybe logger.info(f"tool call {name} completed with result: {result}") /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:1841: in info self.log(INFO, msg, *args, **kwargs) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:1879: in log self.logger.log(level, msg, *args, **kwargs) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:1547: in log self._log(level, msg, args, **kwargs) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:1624: in _log self.handle(record) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:1634: in handle self.callHandlers(record) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:1696: in callHandlers hdlr.handle(record) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py:968: in handle self.emit(record) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/logging.py:167: in emit message_renderable = self.render_message(record, message) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/logging.py:193: in render_message message_text = Text.from_markup(message) if use_markup else Text(message) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/text.py:287: in from_markup rendered_text = render(text, style, emoji=emoji, emoji_variant=emoji_variant) /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/markup.py:167: in render raise MarkupError( E rich.errors.MarkupError: closing tag '[/INST]' at position 3274 doesn't match any open tag Test Plan: --- .../providers/inline/agents/meta_reference/agent_instance.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
index 3619b3f67..b7cba4e46 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@@ -16,6 +16,7 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse

 import httpx
+from rich.markup import escape

 from llama_stack.apis.agents import (
     AgentConfig,
@@ -1029,7 +1030,7 @@ async def execute_tool_call_maybe(
             **toolgroup_args.get(group_name, {}),
         },
     )
-    logger.info(f"tool call {name} completed with result: {result}")
+    logger.info(f"tool call {name} completed with result: {escape(str(result))}")
     return result

From a8d0cdaf372b3f1223aa7e80e650d30ac412cfcd Mon Sep 17 00:00:00 2001
From: Fred Reiss
Date: Fri, 7 Mar 2025 13:38:23 -0800
Subject: [PATCH 23/58] feat: updated inline vllm inference provider (#880)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

This PR updates the inline vLLM inference provider in several significant ways:
* Models are now attached at run time to instances of the provider via the `.../models` API instead of hard-coding the model's full name into the provider's YAML configuration. (See the sketch of this run-time binding after the test-coverage note below.)
* The provider supports models that are not Meta Llama models. Any model that vLLM supports can be loaded by passing Huggingface coordinates in the "provider_model_id" field. Custom fine-tuned versions of Meta Llama models can be loaded by specifying a path on local disk in the "provider_model_id".
* To implement full chat completions support, including tool calling and constrained decoding, the provider now routes the `chat_completions` API to a captive (i.e. called directly in-process, not via HTTPS) instance of vLLM's OpenAI-compatible server.
* The `logprobs` parameter and completions API are also working.

## Test Plan

Existing tests in `llama_stack/providers/tests/inference/test_text_inference.py` have good coverage of the new functionality.
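As a quick illustration of the run-time binding described above, here is a minimal sketch of attaching a model to this provider from a client. It assumes a Llama Stack server on localhost:8321 with an inline vLLM provider registered under provider_id "vllm"; the URL and ids are illustrative, not values taken from this patch:

```
# Minimal sketch: bind a model to the inline vLLM provider at run time
# instead of hard-coding it in the provider's YAML configuration.
# Assumes a running Llama Stack server; all ids below are illustrative.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

client.models.register(
    model_id="meta-llama/Llama-3.2-3B-Instruct",  # name clients will use
    provider_id="vllm",  # which provider instance serves the model
    # Huggingface coordinates, or a local path to a fine-tuned checkpoint
    provider_model_id="meta-llama/Llama-3.2-3B-Instruct",
)
```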
These tests can be invoked as follows: ``` cd llama-stack && pytest \ -vvv \ llama_stack/providers/tests/inference/test_text_inference.py \ --providers inference=vllm \ --inference-model meta-llama/Llama-3.2-3B-Instruct ====================================== test session starts ====================================== platform linux -- Python 3.12.8, pytest-8.3.4, pluggy-1.5.0 -- /mnt/datadisk1/freiss/llama/env/bin/python3.12 cachedir: .pytest_cache metadata: {'Python': '3.12.8', 'Platform': 'Linux-6.8.0-1016-ibm-x86_64-with-glibc2.39', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'anyio': '4.8.0', 'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.2'}, 'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64'} rootdir: /mnt/datadisk1/freiss/llama/llama-stack configfile: pyproject.toml plugins: anyio-4.8.0, html-4.1.1, metadata-3.1.1, asyncio-0.25.2 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 9 items llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_model_list[-vllm] PASSED [ 11%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion[-vllm] PASSED [ 22%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion_logprobs[-vllm] PASSED [ 33%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion_structured_output[-vllm] PASSED [ 44%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_non_streaming[-vllm] PASSED [ 55%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_structured_output[-vllm] PASSED [ 66%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_streaming[-vllm] PASSED [ 77%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling[-vllm] PASSED [ 88%] llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling_streaming[-vllm] PASSED [100%] =========================== 9 passed, 13 warnings in 97.18s (0:01:37) =========================== ``` ## Sources ## Before submitting - [X] Ran pre-commit to handle lint / formatting issues. - [X] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --------- Co-authored-by: Sébastien Han Co-authored-by: Ashwin Bharambe --- .../providers/inline/inference/vllm/config.py | 36 +- .../inline/inference/vllm/openai_utils.py | 170 ++++ .../providers/inline/inference/vllm/vllm.py | 852 ++++++++++++++---- llama_stack/templates/vllm-gpu/run.yaml | 5 +- 4 files changed, 887 insertions(+), 176 deletions(-) create mode 100644 llama_stack/providers/inline/inference/vllm/openai_utils.py diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py index 51ef2d273..0e85c9a48 100644 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ b/llama_stack/providers/inline/inference/vllm/config.py @@ -4,20 +4,19 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field -from llama_stack.providers.utils.inference import supported_inference_models from llama_stack.schema_utils import json_schema_type @json_schema_type class VLLMConfig(BaseModel): - """Configuration for the vLLM inference provider.""" + """Configuration for the vLLM inference provider. + + Note that the model name is no longer part of this static configuration. + You can bind an instance of this provider to a specific model with the + ``models.register()`` API call.""" - model: str = Field( - default="Llama3.2-3B-Instruct", - description="Model descriptor from `llama model list`", - ) tensor_parallel_size: int = Field( default=1, description="Number of tensor parallel replicas (number of GPUs to use).", @@ -26,32 +25,27 @@ class VLLMConfig(BaseModel): default=4096, description="Maximum number of tokens to generate.", ) + max_model_len: int = Field(default=4096, description="Maximum context length to use during serving.") + max_num_seqs: int = Field(default=4, description="Maximum parallel batch size for generation.") enforce_eager: bool = Field( default=False, description="Whether to use eager mode for inference (otherwise cuda graphs are used).", ) gpu_memory_utilization: float = Field( default=0.3, + description=( + "How much GPU memory will be allocated when this provider has finished " + "loading, including memory that was already allocated before loading." + ), ) @classmethod def sample_run_config(cls): return { - "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}", "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}", "max_tokens": "${env.MAX_TOKENS:4096}", + "max_model_len": "${env.MAX_MODEL_LEN:4096}", + "max_num_seqs": "${env.MAX_NUM_SEQS:4}", "enforce_eager": "${env.ENFORCE_EAGER:False}", - "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}", + "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.3}", } - - @field_validator("model") - @classmethod - def validate_model(cls, model: str) -> str: - permitted_models = supported_inference_models() - - descriptors = [m.descriptor() for m in permitted_models] - repos = [m.huggingface_repo for m in permitted_models] - if model not in (descriptors + repos): - model_list = "\n\t".join(repos) - raise ValueError(f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]") - return model diff --git a/llama_stack/providers/inline/inference/vllm/openai_utils.py b/llama_stack/providers/inline/inference/vllm/openai_utils.py new file mode 100644 index 000000000..90b5398f9 --- /dev/null +++ b/llama_stack/providers/inline/inference/vllm/openai_utils.py @@ -0,0 +1,170 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List, Optional + +import vllm + +from llama_stack.apis.inference import ( + ChatCompletionRequest, + GrammarResponseFormat, + JsonSchemaResponseFormat, + Message, + ToolChoice, + UserMessage, +) +from llama_stack.models.llama.datatypes import BuiltinTool, ToolDefinition +from llama_stack.providers.utils.inference.openai_compat import ( + convert_message_to_openai_dict, + get_sampling_options, +) + +############################################################################### +# This file contains OpenAI compatibility code that is currently only used +# by the inline vLLM connector. 
Some or all of this code may be moved to a +# central location at a later date. + + +def _merge_context_into_content(message: Message) -> Message: # type: ignore + """ + Merge the ``context`` field of a Llama Stack ``Message`` object into + the content field for compabilitiy with OpenAI-style APIs. + + Generates a content string that emulates the current behavior + of ``llama_models.llama3.api.chat_format.encode_message()``. + + :param message: Message that may include ``context`` field + + :returns: A version of ``message`` with any context merged into the + ``content`` field. + """ + if not isinstance(message, UserMessage): # Separate type check for linter + return message + if message.context is None: + return message + return UserMessage( + role=message.role, + # Emumate llama_models.llama3.api.chat_format.encode_message() + content=message.content + "\n\n" + message.context, + context=None, + ) + + +def _llama_stack_tools_to_openai_tools( + tools: Optional[List[ToolDefinition]] = None, +) -> List[vllm.entrypoints.openai.protocol.ChatCompletionToolsParam]: + """ + Convert the list of available tools from Llama Stack's format to vLLM's + version of OpenAI's format. + """ + if tools is None: + return [] + + result = [] + for t in tools: + if isinstance(t.tool_name, BuiltinTool): + raise NotImplementedError("Built-in tools not yet implemented") + if t.parameters is None: + parameters = None + else: # if t.parameters is not None + # Convert the "required" flags to a list of required params + required_params = [k for k, v in t.parameters.items() if v.required] + parameters = { + "type": "object", # Mystery value that shows up in OpenAI docs + "properties": { + k: {"type": v.param_type, "description": v.description} for k, v in t.parameters.items() + }, + "required": required_params, + } + + function_def = vllm.entrypoints.openai.protocol.FunctionDefinition( + name=t.tool_name, description=t.description, parameters=parameters + ) + + # Every tool definition is double-boxed in a ChatCompletionToolsParam + result.append(vllm.entrypoints.openai.protocol.ChatCompletionToolsParam(function=function_def)) + return result + + +async def llama_stack_chat_completion_to_openai_chat_completion_dict( + request: ChatCompletionRequest, +) -> dict: + """ + Convert a chat completion request in Llama Stack format into an + equivalent set of arguments to pass to an OpenAI-compatible + chat completions API. + + :param request: Bundled request parameters in Llama Stack format. + + :returns: Dictionary of key-value pairs to use as an initializer + for a dataclass or to be converted directly to JSON and sent + over the wire. + """ + + converted_messages = [ + # This mystery async call makes the parent function also be async + await convert_message_to_openai_dict(_merge_context_into_content(m), download=True) + for m in request.messages + ] + converted_tools = _llama_stack_tools_to_openai_tools(request.tools) + + # Llama will try to use built-in tools with no tool catalog, so don't enable + # tool choice unless at least one tool is enabled. + converted_tool_choice = "none" + if ( + request.tool_config is not None + and request.tool_config.tool_choice == ToolChoice.auto + and request.tools is not None + and len(request.tools) > 0 + ): + converted_tool_choice = "auto" + + # TODO: Figure out what to do with the tool_prompt_format argument. + # Other connectors appear to drop it quietly. + + # Use Llama Stack shared code to translate sampling parameters. 
+ sampling_options = get_sampling_options(request.sampling_params) + + # get_sampling_options() translates repetition penalties to an option that + # OpenAI's APIs don't know about. + # vLLM's OpenAI-compatible API also handles repetition penalties wrong. + # For now, translate repetition penalties into a format that vLLM's broken + # API will handle correctly. Two wrongs make a right... + if "repeat_penalty" in sampling_options: + del sampling_options["repeat_penalty"] + if request.sampling_params.repetition_penalty is not None and request.sampling_params.repetition_penalty != 1.0: + sampling_options["repetition_penalty"] = request.sampling_params.repetition_penalty + + # Convert a single response format into four different parameters, per + # the OpenAI spec + guided_decoding_options = dict() + if request.response_format is None: + # Use defaults + pass + elif isinstance(request.response_format, JsonSchemaResponseFormat): + guided_decoding_options["guided_json"] = request.response_format.json_schema + elif isinstance(request.response_format, GrammarResponseFormat): + guided_decoding_options["guided_grammar"] = request.response_format.bnf + else: + raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(request.response_format)}'") + + logprob_options = dict() + if request.logprobs is not None: + logprob_options["logprobs"] = request.logprobs.top_k + + # Marshall together all the arguments for a ChatCompletionRequest + request_options = { + "model": request.model, + "messages": converted_messages, + "tools": converted_tools, + "tool_choice": converted_tool_choice, + "stream": request.stream, + **sampling_options, + **guided_decoding_options, + **logprob_options, + } + + return request_options diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py index b461bf44a..b59df13d0 100644 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ b/llama_stack/providers/inline/inference/vllm/vllm.py @@ -4,45 +4,71 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import logging -import os +import json +import re import uuid -from typing import AsyncGenerator, List, Optional +from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union +# These vLLM modules contain names that overlap with Llama Stack names, so we import +# fully-qualified names +import vllm.entrypoints.openai.protocol +import vllm.sampling_params from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams as VLLMSamplingParams +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, + TextDelta, + ToolCallDelta, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, + ChatCompletionResponseEvent, + ChatCompletionResponseEventType, ChatCompletionResponseStreamChunk, + CompletionMessage, CompletionResponse, CompletionResponseStreamChunk, EmbeddingsResponse, EmbeddingTaskType, + GrammarResponseFormat, Inference, - InterleavedContentItem, + JsonSchemaResponseFormat, LogProbConfig, Message, ResponseFormat, SamplingParams, TextTruncation, + TokenLogProbs, ToolChoice, ToolConfig, - ToolDefinition, - ToolPromptFormat, ) from llama_stack.apis.models import Model +from llama_stack.log import get_logger +from llama_stack.models.llama import sku_list +from llama_stack.models.llama.datatypes import ( + StopReason, + ToolCall, + ToolDefinition, + ToolPromptFormat, + TopKSamplingStrategy, + TopPSamplingStrategy, +) +from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer -from llama_stack.models.llama.sku_list import resolve_model -from llama_stack.providers.datatypes import ModelsProtocolPrivate +from llama_stack.providers.remote.inference.vllm.vllm import build_hf_repo_model_entries +from llama_stack.providers.utils.inference.model_registry import ( + ModelRegistryHelper, + ModelsProtocolPrivate, +) from llama_stack.providers.utils.inference.openai_compat import ( OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, - get_sampling_options, - process_chat_completion_response, + get_stop_reason, process_chat_completion_stream_response, ) from llama_stack.providers.utils.inference.prompt_adapter import ( @@ -50,94 +76,288 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( ) from .config import VLLMConfig +from .openai_utils import llama_stack_chat_completion_to_openai_chat_completion_dict -log = logging.getLogger(__name__) +# Map from Hugging Face model architecture name to appropriate tool parser. +# See vllm.entrypoints.openai.tool_parsers.ToolParserManager.tool_parsers for the full list of +# available parsers. 
+# TODO: Expand this list +CONFIG_TYPE_TO_TOOL_PARSER = { + "GraniteConfig": "granite", + "MllamaConfig": "llama3_json", + "LlamaConfig": "llama3_json", +} +DEFAULT_TOOL_PARSER = "pythonic" -def _random_uuid() -> str: +logger = get_logger(__name__, category="inference") + + +def _random_uuid_str() -> str: return str(uuid.uuid4().hex) +def _response_format_to_guided_decoding_params( + response_format: Optional[ResponseFormat], # type: ignore +) -> vllm.sampling_params.GuidedDecodingParams: + """ + Translate constrained decoding parameters from Llama Stack's format to vLLM's format. + + :param response_format: Llama Stack version of constrained decoding info. Can be ``None``, + indicating no constraints. + :returns: The equivalent dataclass object for the low-level inference layer of vLLM. + """ + if response_format is None: + # As of vLLM 0.6.3, the default constructor for GuidedDecodingParams() returns an invalid + # value that crashes the executor on some code paths. Use ``None`` instead. + return None + + # Llama Stack currently implements fewer types of constrained decoding than vLLM does. + # Translate the types that exist and detect if Llama Stack adds new ones. + if isinstance(response_format, JsonSchemaResponseFormat): + return vllm.sampling_params.GuidedDecodingParams(json=response_format.json_schema) + elif isinstance(response_format, GrammarResponseFormat): + # BNF grammar. + # Llama Stack uses the parse tree of the grammar, while vLLM uses the string + # representation of the grammar. + raise TypeError( + "Constrained decoding with BNF grammars is not currently implemented, because the " + "reference implementation does not implement it." + ) + else: + raise TypeError(f"ResponseFormat object is of unexpected subtype '{type(response_format)}'") + + +def _convert_sampling_params( + sampling_params: Optional[SamplingParams], + response_format: Optional[ResponseFormat], # type: ignore + log_prob_config: Optional[LogProbConfig], +) -> vllm.SamplingParams: + """Convert sampling and constrained decoding configuration from Llama Stack's format to vLLM's + format.""" + # In the absence of provided config values, use Llama Stack defaults as encoded in the Llama + # Stack dataclasses. These defaults are different from vLLM's defaults. + if sampling_params is None: + sampling_params = SamplingParams() + if log_prob_config is None: + log_prob_config = LogProbConfig() + + if isinstance(sampling_params.strategy, TopKSamplingStrategy): + if sampling_params.strategy.top_k == 0: + # vLLM treats "k" differently for top-k sampling + vllm_top_k = -1 + else: + vllm_top_k = sampling_params.strategy.top_k + else: + vllm_top_k = -1 + + if isinstance(sampling_params.strategy, TopPSamplingStrategy): + vllm_top_p = sampling_params.strategy.top_p + # Llama Stack only allows temperature with top-P. + vllm_temperature = sampling_params.strategy.temperature + else: + vllm_top_p = 1.0 + vllm_temperature = 0.0 + + # vLLM allows top-p and top-k at the same time. 
+    vllm_sampling_params = vllm.SamplingParams.from_optional(
+        max_tokens=(None if sampling_params.max_tokens == 0 else sampling_params.max_tokens),
+        temperature=vllm_temperature,
+        top_p=vllm_top_p,
+        top_k=vllm_top_k,
+        repetition_penalty=sampling_params.repetition_penalty,
+        guided_decoding=_response_format_to_guided_decoding_params(response_format),
+        logprobs=log_prob_config.top_k,
+    )
+    return vllm_sampling_params
+
+
 class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
-    """Inference implementation for vLLM."""
+    """
+    vLLM-based inference model adapter for Llama Stack with support for multiple models.
+
+    Requires the configuration parameters documented in the :class:`VLLMConfig` class.
+    """
+
+    config: VLLMConfig
+    register_helper: ModelRegistryHelper
+    model_ids: set[str]
+    resolved_model_id: str | None
+    engine: AsyncLLMEngine | None
+    chat: OpenAIServingChat | None
+    is_meta_llama_model: bool

     def __init__(self, config: VLLMConfig):
         self.config = config
+        logger.info(f"Config is: {self.config}")
+
+        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
+        self.formatter = ChatFormat(Tokenizer.get_instance())
+
+        # The following are initialized when paths are bound to this provider
+        self.resolved_model_id = None
+        self.model_ids = set()
         self.engine = None
+        self.chat = None
+        self.is_meta_llama_model = False

-    async def initialize(self):
-        log.info("Initializing vLLM inference provider.")
+    ###########################################################################
+    # METHODS INHERITED FROM IMPLICIT BASE CLASS.
+    # TODO: Make this class inherit from the new base class ProviderBase once that class exists.

-        # Disable usage stats reporting. This would be a surprising thing for most
-        # people to find out was on by default.
-        # https://docs.vllm.ai/en/latest/serving/usage_stats.html
-        if "VLLM_NO_USAGE_STATS" not in os.environ:
-            os.environ["VLLM_NO_USAGE_STATS"] = "1"
+    async def initialize(self) -> None:
+        """
+        Callback that is invoked through many levels of indirection during provider class
+        instantiation, sometime after __init__() is called and before any model registration
+        methods or methods connected to a REST API are called.

-        model = resolve_model(self.config.model)
-        if model is None:
-            raise ValueError(f"Unknown model {self.config.model}")
+        It's not clear what assumptions the class can make about the platform's initialization
+        state here that can't be made during __init__(), and vLLM can't be started until we know
+        what model it's supposed to be serving, so nothing happens here currently.
+        """
+        pass

-        if model.huggingface_repo is None:
-            raise ValueError(f"Model {self.config.model} needs a huggingface repo")
-
-        # TODO -- there are a ton of options supported here ...
- engine_args = AsyncEngineArgs( - model=model.huggingface_repo, - tokenizer=model.huggingface_repo, - tensor_parallel_size=self.config.tensor_parallel_size, - enforce_eager=self.config.enforce_eager, - gpu_memory_utilization=self.config.gpu_memory_utilization, - guided_decoding_backend="lm-format-enforcer", - ) - - self.engine = AsyncLLMEngine.from_engine_args(engine_args) - - async def shutdown(self): - """Shut down the vLLM inference adapter.""" - log.info("Shutting down vLLM inference provider.") - if self.engine: + async def shutdown(self) -> None: + logger.info(f"Shutting down inline vLLM inference provider {self}.") + if self.engine is not None: self.engine.shutdown_background_loop() + self.engine = None + self.chat = None + self.model_ids = set() + self.resolved_model_id = None + + ########################################################################### + # METHODS INHERITED FROM ModelsProtocolPrivate INTERFACE # Note that the return type of the superclass method is WRONG async def register_model(self, model: Model) -> Model: """ - Callback that is called when the server associates an inference endpoint - with an inference provider. + Callback that is called when the server associates an inference endpoint with an + inference provider. - :param model: Object that encapsulates parameters necessary for identifying - a specific LLM. + :param model: Object that encapsulates parameters necessary for identifying a specific + LLM. - :returns: The input ``Model`` object. It may or may not be permissible - to change fields before returning this object. + :returns: The input ``Model`` object. It may or may not be permissible to change fields + before returning this object. """ - log.info(f"Registering model {model.identifier} with vLLM inference provider.") - # The current version of this provided is hard-coded to serve only - # the model specified in the YAML config file. - configured_model = resolve_model(self.config.model) - registered_model = resolve_model(model.model_id) + logger.debug(f"In register_model({model})") + + # First attempt to interpret the model coordinates as a Llama model name + resolved_llama_model = sku_list.resolve_model(model.provider_model_id) + if resolved_llama_model is not None: + # Load from Hugging Face repo into default local cache dir + model_id_for_vllm = resolved_llama_model.huggingface_repo + + # Detect a genuine Meta Llama model to trigger Meta-specific preprocessing. + # Don't set self.is_meta_llama_model until we actually load the model. + is_meta_llama_model = True + else: # if resolved_llama_model is None + # Not a Llama model name. Pass the model id through to vLLM's loader + model_id_for_vllm = model.provider_model_id + is_meta_llama_model = False + + if self.resolved_model_id is not None: + if model_id_for_vllm != self.resolved_model_id: + raise ValueError( + f"Attempted to serve two LLMs (ids '{self.resolved_model_id}') and " + f"'{model_id_for_vllm}') from one copy of provider '{self}'. Use multiple " + f"copies of the provider instead." + ) + else: + # Model already loaded + logger.info( + f"Requested id {model} resolves to {model_id_for_vllm}, which is already loaded. Continuing." + ) + self.model_ids.add(model.model_id) + return model + + logger.info(f"Requested id {model} resolves to {model_id_for_vllm}. Loading {model_id_for_vllm}.") + if is_meta_llama_model: + logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.") + self.is_meta_llama_model = is_meta_llama_model + + # If we get here, this is the first time registering a model. 
+ # Preload so that the first inference request won't time out. + engine_args = AsyncEngineArgs( + model=model_id_for_vllm, + tokenizer=model_id_for_vllm, + tensor_parallel_size=self.config.tensor_parallel_size, + enforce_eager=self.config.enforce_eager, + gpu_memory_utilization=self.config.gpu_memory_utilization, + max_num_seqs=self.config.max_num_seqs, + max_model_len=self.config.max_model_len, + ) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + # vLLM currently requires the user to specify the tool parser manually. To choose a tool + # parser, we need to determine what model architecture is being used. For now, we infer + # that information from what config class the model uses. + low_level_model_config = self.engine.engine.get_model_config() + hf_config = low_level_model_config.hf_config + hf_config_class_name = hf_config.__class__.__name__ + if hf_config_class_name in CONFIG_TYPE_TO_TOOL_PARSER: + tool_parser = CONFIG_TYPE_TO_TOOL_PARSER[hf_config_class_name] + else: + # No info -- choose a default so we can at least attempt tool + # use. + tool_parser = DEFAULT_TOOL_PARSER + logger.debug(f"{hf_config_class_name=}") + logger.debug(f"{tool_parser=}") + + # Wrap the lower-level engine in an OpenAI-compatible chat API + model_config = await self.engine.get_model_config() + self.chat = OpenAIServingChat( + engine_client=self.engine, + model_config=model_config, + models=OpenAIServingModels( + engine_client=self.engine, + model_config=model_config, + base_model_paths=[ + # The layer below us will only see resolved model IDs + BaseModelPath(model_id_for_vllm, model_id_for_vllm) + ], + ), + response_role="assistant", + request_logger=None, # Use default logging + chat_template=None, # Use default template from model checkpoint + enable_auto_tools=True, + tool_parser=tool_parser, + chat_template_content_format="auto", + ) + self.resolved_model_id = model_id_for_vllm + self.model_ids.add(model.model_id) + + logger.info(f"Finished preloading model: {model_id_for_vllm}") - if configured_model.core_model_id != registered_model.core_model_id: - raise ValueError( - f"Requested model '{model.identifier}' is different from " - f"model '{self.config.model}' that this provider " - f"is configured to serve" - ) return model - def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams: - if sampling_params is None: - return VLLMSamplingParams(max_tokens=self.config.max_tokens) - - options = get_sampling_options(sampling_params) - if "repeat_penalty" in options: - options["repetition_penalty"] = options["repeat_penalty"] - del options["repeat_penalty"] - - return VLLMSamplingParams(**options) - async def unregister_model(self, model_id: str) -> None: - pass + """ + Callback that is called when the server removes an inference endpoint from an inference + provider. + + :param model_id: The same external ID that the higher layers of the stack previously passed + to :func:`register_model()` + """ + if model_id not in self.model_ids: + raise ValueError( + f"Attempted to unregister model ID '{model_id}', but that ID is not registered to this provider." + ) + self.model_ids.remove(model_id) + + if len(self.model_ids) == 0: + # Last model was just unregistered. Shut down the connection to vLLM and free up + # resources. + # Note that this operation may cause in-flight chat completion requests on the + # now-unregistered model to return errors. 
+ self.resolved_model_id = None + self.chat = None + self.engine.shutdown_background_loop() + self.engine = None + + ########################################################################### + # METHODS INHERITED FROM Inference INTERFACE async def completion( self, @@ -147,93 +367,31 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate): response_format: Optional[ResponseFormat] = None, stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, - ) -> CompletionResponse | CompletionResponseStreamChunk: - raise NotImplementedError("Completion not implemented for vLLM") - - async def chat_completion( - self, - model_id: str, - messages: List[Message], - sampling_params: Optional[SamplingParams] = None, - tools: Optional[List[ToolDefinition]] = None, - tool_choice: Optional[ToolChoice] = ToolChoice.auto, - tool_prompt_format: Optional[ToolPromptFormat] = None, - response_format: Optional[ResponseFormat] = None, - stream: Optional[bool] = False, - logprobs: Optional[LogProbConfig] = None, - tool_config: Optional[ToolConfig] = None, - ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk: + ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: + if model_id not in self.model_ids: + raise ValueError( + f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}" + ) + if not isinstance(content, str): + raise NotImplementedError("Multimodal input not currently supported") if sampling_params is None: sampling_params = SamplingParams() - assert self.engine is not None - request = ChatCompletionRequest( - model=model_id, - messages=messages, - sampling_params=sampling_params, - tools=tools or [], - stream=stream, - logprobs=logprobs, - tool_config=tool_config, - ) + converted_sampling_params = _convert_sampling_params(sampling_params, response_format, logprobs) - log.info("Sampling params: %s", sampling_params) - request_id = _random_uuid() + logger.debug(f"{converted_sampling_params=}") - prompt = await chat_completion_request_to_prompt(request, self.config.model) - vllm_sampling_params = self._sampling_params(request.sampling_params) - results_generator = self.engine.generate(prompt, vllm_sampling_params, request_id) if stream: - return self._stream_chat_completion(request, results_generator) + return self._streaming_completion(content, converted_sampling_params) else: - return await self._nonstream_chat_completion(request, results_generator) - - async def _nonstream_chat_completion( - self, request: ChatCompletionRequest, results_generator: AsyncGenerator - ) -> ChatCompletionResponse: - outputs = [o async for o in results_generator] - final_output = outputs[-1] - - assert final_output is not None - outputs = final_output.outputs - finish_reason = outputs[-1].stop_reason - choice = OpenAICompatCompletionChoice( - finish_reason=finish_reason, - text="".join([output.text for output in outputs]), - ) - response = OpenAICompatCompletionResponse( - choices=[choice], - ) - return process_chat_completion_response(response, request) - - async def _stream_chat_completion( - self, request: ChatCompletionRequest, results_generator: AsyncGenerator - ) -> AsyncGenerator: - tokenizer = Tokenizer.get_instance() - - async def _generate_and_convert_to_openai_compat(): - cur = [] - async for chunk in results_generator: - if not chunk.outputs: - log.warning("Empty chunk received") - continue - - output = chunk.outputs[-1] - - new_tokens = output.token_ids[len(cur) :] - text = tokenizer.decode(new_tokens) - 
cur.extend(new_tokens)
-            choice = OpenAICompatCompletionChoice(
-                finish_reason=output.finish_reason,
-                text=text,
-            )
-            yield OpenAICompatCompletionResponse(
-                choices=[choice],
-            )
-
-        stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, request):
-            yield chunk
+            streaming_result = None
+            async for streaming_result in self._streaming_completion(content, converted_sampling_params):
+                pass
+            return CompletionResponse(
+                content=streaming_result.delta,
+                stop_reason=streaming_result.stop_reason,
+                logprobs=streaming_result.logprobs,
+            )

     async def embeddings(
         self,
@@ -244,3 +402,391 @@
         task_type: Optional[EmbeddingTaskType] = None,
     ) -> EmbeddingsResponse:
         raise NotImplementedError()
+
+    async def chat_completion(
+        self,
+        model_id: str,
+        messages: List[Message],  # type: ignore
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,  # type: ignore
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+        tool_config: Optional[ToolConfig] = None,
+    ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
+        sampling_params = sampling_params or SamplingParams()
+        if model_id not in self.model_ids:
+            raise ValueError(
+                f"This adapter is not registered to model id '{model_id}'. Registered IDs are: {self.model_ids}"
+            )
+
+        # Convert to Llama Stack internal format for consistency
+        request = ChatCompletionRequest(
+            model=self.resolved_model_id,
+            messages=messages,
+            sampling_params=sampling_params,
+            response_format=response_format,
+            tools=tools,
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
+        if self.is_meta_llama_model:
+            # Bypass vLLM chat templating layer for Meta Llama models, because the
+            # templating layer in Llama Stack currently produces better results.
+            logger.debug(
+                f"Routing {self.resolved_model_id} chat completion through "
+                f"Llama Stack's templating layer instead of vLLM's."
+            )
+            return await self._chat_completion_for_meta_llama(request)
+
+        logger.debug(f"{self.resolved_model_id} is not a Meta Llama model")
+
+        # Arguments to the vLLM call must be packaged as a ChatCompletionRequest dataclass.
+        # Note that this dataclass has the same name as a similar dataclass in Llama Stack.
+        request_options = await llama_stack_chat_completion_to_openai_chat_completion_dict(request)
+        chat_completion_request = vllm.entrypoints.openai.protocol.ChatCompletionRequest(**request_options)
+
+        logger.debug(f"Converted request: {chat_completion_request}")
+
+        vllm_result = await self.chat.create_chat_completion(chat_completion_request)
+        logger.debug(f"Result from vLLM: {vllm_result}")
+        if isinstance(vllm_result, vllm.entrypoints.openai.protocol.ErrorResponse):
+            raise ValueError(f"Error from vLLM layer: {vllm_result}")
+
+        # Return type depends on "stream" argument
+        if stream:
+            if not isinstance(vllm_result, AsyncGenerator):
+                raise TypeError(f"Unexpected result type {type(vllm_result)} for streaming inference call")
+            # vLLM client returns a stream of strings, which need to be parsed.
+            # Stream comes in the form of an async generator.
+ return self._convert_streaming_results(vllm_result) + else: + if not isinstance(vllm_result, vllm.entrypoints.openai.protocol.ChatCompletionResponse): + raise TypeError(f"Unexpected result type {type(vllm_result)} for non-streaming inference call") + return self._convert_non_streaming_results(vllm_result) + + ########################################################################### + # INTERNAL METHODS + + async def _streaming_completion( + self, content: str, sampling_params: vllm.SamplingParams + ) -> AsyncIterator[CompletionResponseStreamChunk]: + """Internal implementation of :func:`completion()` API for the streaming case. Assumes + that arguments have been validated upstream. + + :param content: Must be a string + :param sampling_params: Paramters from public API's ``response_format`` + and ``sampling_params`` arguments, converted to VLLM format + """ + # We run agains the vLLM generate() call directly instead of using the OpenAI-compatible + # layer, because doing so simplifies the code here. + + # The vLLM engine requires a unique identifier for each call to generate() + request_id = _random_uuid_str() + + # The vLLM generate() API is streaming-only and returns an async generator. + # The generator returns objects of type vllm.RequestOutput. + results_generator = self.engine.generate(content, sampling_params, request_id) + + # Need to know the model's EOS token ID for the conversion code below. + # AsyncLLMEngine is a wrapper around LLMEngine, and the tokenizer is only available if + # we drill down to the LLMEngine inside the AsyncLLMEngine. + # Similarly, the tokenizer in an LLMEngine is a wrapper around a BaseTokenizerGroup, + # and we need to drill down to the Hugging Face tokenizer inside the BaseTokenizerGroup. + llm_engine = self.engine.engine + tokenizer_group = llm_engine.tokenizer + eos_token_id = tokenizer_group.tokenizer.eos_token_id + + request_output: vllm.RequestOutput = None + async for request_output in results_generator: + # Check for weird inference failures + if request_output.outputs is None or len(request_output.outputs) == 0: + # This case also should never happen + raise ValueError("Inference produced empty result") + + # If we get here, then request_output contains the final output of the generate() call. + # The result may include multiple alternate outputs, but Llama Stack APIs only allow + # us to return one. + output: vllm.CompletionOutput = request_output.outputs[0] + completion_string = output.text + + # Convert logprobs from vLLM's format to Llama Stack's format + logprobs = [ + TokenLogProbs(logprobs_by_token={v.decoded_token: v.logprob for _, v in logprob_dict.items()}) + for logprob_dict in output.logprobs + ] + + # The final output chunk should be labeled with the reason that the overall generate() + # call completed. + logger.debug(f"{output.stop_reason=}; {type(output.stop_reason)=}") + if output.stop_reason is None: + stop_reason = None # Still going + elif output.stop_reason == "stop": + stop_reason = StopReason.end_of_turn + elif output.stop_reason == "length": + stop_reason = StopReason.out_of_tokens + elif isinstance(output.stop_reason, int): + # If the model config specifies multiple end-of-sequence tokens, then vLLM + # will return the token ID of the EOS token in the stop_reason field. + stop_reason = StopReason.end_of_turn + else: + raise ValueError(f"Unrecognized stop reason '{output.stop_reason}'") + + # vLLM's protocol outputs the stop token, then sets end of message on the next step for + # some reason. 
+ if request_output.outputs[-1].token_ids[-1] == eos_token_id: + stop_reason = StopReason.end_of_message + + yield CompletionResponseStreamChunk(delta=completion_string, stop_reason=stop_reason, logprobs=logprobs) + + # Llama Stack requires that the last chunk have a stop reason, but vLLM doesn't always + # provide one if it runs out of tokens. + if stop_reason is None: + yield CompletionResponseStreamChunk( + delta=completion_string, + stop_reason=StopReason.out_of_tokens, + logprobs=logprobs, + ) + + def _convert_non_streaming_results( + self, vllm_result: vllm.entrypoints.openai.protocol.ChatCompletionResponse + ) -> ChatCompletionResponse: + """ + Subroutine to convert the non-streaming output of vLLM's OpenAI-compatible API into an + equivalent Llama Stack object. + + The result from vLLM's non-streaming API is a dataclass with the same name as the Llama + Stack ChatCompletionResponse dataclass, but with more and different field names. We ignore + the fields that aren't currently present in the Llama Stack dataclass. + """ + + # There may be multiple responses, but we can only pass through the first one. + if len(vllm_result.choices) == 0: + raise ValueError("Don't know how to convert response object without any responses") + vllm_message = vllm_result.choices[0].message + vllm_finish_reason = vllm_result.choices[0].finish_reason + + converted_message = CompletionMessage( + role=vllm_message.role, + # Llama Stack API won't accept None for content field. + content=("" if vllm_message.content is None else vllm_message.content), + stop_reason=get_stop_reason(vllm_finish_reason), + tool_calls=[ + ToolCall( + call_id=t.id, + tool_name=t.function.name, + # vLLM function args come back as a string. Llama Stack expects JSON. + arguments=json.loads(t.function.arguments), + ) + for t in vllm_message.tool_calls + ], + ) + + # TODO: Convert logprobs + + logger.debug(f"Converted message: {converted_message}") + + return ChatCompletionResponse( + completion_message=converted_message, + ) + + async def _chat_completion_for_meta_llama( + self, request: ChatCompletionRequest + ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]: + """ + Subroutine that routes chat completions for Meta Llama models through Llama Stack's + chat template instead of using vLLM's version of that template. The Llama Stack version + of the chat template currently produces more reliable outputs. + + Once vLLM's support for Meta Llama models has matured more, we should consider routing + Meta Llama requests through the vLLM chat completions API instead of using this method. + """ + formatter = ChatFormat(Tokenizer.get_instance()) + + # Note that this function call modifies `request` in place. + prompt = await chat_completion_request_to_prompt(request, self.resolved_model_id) + + model_id = list(self.model_ids)[0] # Any model ID will do here + completion_response_or_iterator = await self.completion( + model_id=model_id, + content=prompt, + sampling_params=request.sampling_params, + response_format=request.response_format, + stream=request.stream, + logprobs=request.logprobs, + ) + + if request.stream: + if not isinstance(completion_response_or_iterator, AsyncIterator): + raise TypeError( + f"Received unexpected result type {type(completion_response_or_iterator)}for streaming request." 
+ ) + return self._chat_completion_for_meta_llama_streaming(completion_response_or_iterator, request) + + # elsif not request.stream: + if not isinstance(completion_response_or_iterator, CompletionResponse): + raise TypeError( + f"Received unexpected result type {type(completion_response_or_iterator)}for non-streaming request." + ) + completion_response: CompletionResponse = completion_response_or_iterator + raw_message = formatter.decode_assistant_message_from_content( + completion_response.content, completion_response.stop_reason + ) + return ChatCompletionResponse( + completion_message=CompletionMessage( + content=raw_message.content, + stop_reason=raw_message.stop_reason, + tool_calls=raw_message.tool_calls, + ), + logprobs=completion_response.logprobs, + ) + + async def _chat_completion_for_meta_llama_streaming( + self, results_iterator: AsyncIterator, request: ChatCompletionRequest + ) -> AsyncIterator: + """ + Code from :func:`_chat_completion_for_meta_llama()` that needs to be a separate + method to keep asyncio happy. + """ + + # Convert to OpenAI format, then use shared code to convert to Llama Stack format. + async def _generate_and_convert_to_openai_compat(): + chunk: CompletionResponseStreamChunk # Make Pylance happy + last_text_len = 0 + async for chunk in results_iterator: + if chunk.stop_reason == StopReason.end_of_turn: + finish_reason = "stop" + elif chunk.stop_reason == StopReason.end_of_message: + finish_reason = "eos" + elif chunk.stop_reason == StopReason.out_of_tokens: + finish_reason = "length" + else: + finish_reason = None + + # Convert delta back to an actual delta + text_delta = chunk.delta[last_text_len:] + last_text_len = len(chunk.delta) + + logger.debug(f"{text_delta=}; {finish_reason=}") + + yield OpenAICompatCompletionResponse( + choices=[OpenAICompatCompletionChoice(finish_reason=finish_reason, text=text_delta)] + ) + + stream = _generate_and_convert_to_openai_compat() + async for chunk in process_chat_completion_stream_response(stream, request): + logger.debug(f"Returning chunk: {chunk}") + yield chunk + + async def _convert_streaming_results(self, vllm_result: AsyncIterator) -> AsyncIterator: + """ + Subroutine that wraps the streaming outputs of vLLM's OpenAI-compatible + API into a second async iterator that returns Llama Stack objects. + + :param vllm_result: Stream of strings that need to be parsed + """ + # Tool calls come in pieces, but Llama Stack expects them in bigger chunks. We build up + # those chunks and output them at the end. + # This data structure holds the current set of partial tool calls. + index_to_tool_call: Dict[int, Dict] = dict() + + # The Llama Stack event stream must always start with a start event. Use an empty one to + # simplify logic below + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.start, + delta=TextDelta(text=""), + stop_reason=None, + ) + ) + + converted_stop_reason = None + async for chunk_str in vllm_result: + # Due to OpenAI compatibility, each event in the stream will start with "data: " and + # end with "\n\n". 
+ _prefix = "data: " + _suffix = "\n\n" + if not chunk_str.startswith(_prefix) or not chunk_str.endswith(_suffix): + raise ValueError(f"Can't parse result string from vLLM: '{re.escape(chunk_str)}'") + + # In between the "data: " and newlines is an event record + data_str = chunk_str[len(_prefix) : -len(_suffix)] + + # The end of the stream is indicated with "[DONE]" + if data_str == "[DONE]": + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.complete, + delta=TextDelta(text=""), + stop_reason=converted_stop_reason, + ) + ) + return + + # Anything that is not "[DONE]" should be a JSON record + parsed_chunk = json.loads(data_str) + + logger.debug(f"Parsed JSON event to:\n{json.dumps(parsed_chunk, indent=2)}") + + # The result may contain multiple completions, but Llama Stack APIs only support + # returning one. + first_choice = parsed_chunk["choices"][0] + converted_stop_reason = get_stop_reason(first_choice["finish_reason"]) + delta_record = first_choice["delta"] + + if "content" in delta_record: + # Text delta + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.progress, + delta=TextDelta(text=delta_record["content"]), + stop_reason=converted_stop_reason, + ) + ) + elif "tool_calls" in delta_record: + # Tool call(s). Llama Stack APIs do not have a clear way to return partial tool + # calls, so buffer until we get a "tool calls" stop reason + for tc in delta_record["tool_calls"]: + index = tc["index"] + if index not in index_to_tool_call: + # First time this tool call is showing up + index_to_tool_call[index] = dict() + tool_call = index_to_tool_call[index] + if "id" in tc: + tool_call["call_id"] = tc["id"] + if "function" in tc: + if "name" in tc["function"]: + tool_call["tool_name"] = tc["function"]["name"] + if "arguments" in tc["function"]: + # Arguments comes in as pieces of a string + if "arguments_str" not in tool_call: + tool_call["arguments_str"] = "" + tool_call["arguments_str"] += tc["function"]["arguments"] + else: + raise ValueError(f"Don't know how to parse event delta: {delta_record}") + + if first_choice["finish_reason"] == "tool_calls": + # Special OpenAI code for "tool calls complete". + # Output the buffered tool calls. Llama Stack requires a separate event per tool + # call. + for tool_call_record in index_to_tool_call.values(): + # Arguments come in as a string. Parse the completed string. + tool_call_record["arguments"] = json.loads(tool_call_record["arguments_str"]) + del tool_call_record["arguments_str"] + + yield ChatCompletionResponseStreamChunk( + event=ChatCompletionResponseEvent( + event_type=ChatCompletionResponseEventType.progress, + delta=ToolCallDelta(tool_call=tool_call_record, parse_status="succeeded"), + stop_reason=converted_stop_reason, + ) + ) + + # If we get here, we've lost the connection with the vLLM event stream before it ended + # normally. 
+ raise ValueError("vLLM event stream ended without [DONE] message.") diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index cdce5510d..8a15ff016 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -15,11 +15,12 @@ providers: - provider_id: vllm provider_type: inline::vllm config: - model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct} tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1} max_tokens: ${env.MAX_TOKENS:4096} + max_model_len: ${env.MAX_MODEL_LEN:4096} + max_num_seqs: ${env.MAX_NUM_SEQS:4} enforce_eager: ${env.ENFORCE_EAGER:False} - gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7} + gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.3} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} From a1cdace09304e9c7037a0d5da91524b13d0e634f Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 13:39:26 -0800 Subject: [PATCH 24/58] test: image downloading is flaky (#1491) Summary: Test Plan: --- tests/integration/inference/test_vision_inference.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/inference/test_vision_inference.py b/tests/integration/inference/test_vision_inference.py index 6029a8c72..984e563d7 100644 --- a/tests/integration/inference/test_vision_inference.py +++ b/tests/integration/inference/test_vision_inference.py @@ -27,6 +27,7 @@ def base64_image_url(base64_image_data, image_path): return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" +@pytest.mark.xfail(reason="This test is failing because the image is not being downloaded correctly.") def test_image_chat_completion_non_streaming(client_with_models, vision_model_id): message = { "role": "user", @@ -55,6 +56,7 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) +@pytest.mark.xfail(reason="This test is failing because the image is not being downloaded correctly.") def test_image_chat_completion_streaming(client_with_models, vision_model_id): message = { "role": "user", From b0cc38b269f61621dcba52ef9efda6fd2f88e11b Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 13:45:25 -0800 Subject: [PATCH 25/58] test: fix recordable mocks cache key (#1492) Summary: CI writes files to /tmp [{"__module__": "llama_stack.apis.inference.inference", "__pydantic__": "SystemMessage", "data": {"content": "You are a helpful assistant", "role": "system"}}, {"__module__": "llama_stack.apis.inference.inference", "__pydantic__": "UserMessage", "data": {"content": "Here is a csv file, can you describe it?", "context": null, "role": "user"}}, {"__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ToolResponseMessage", "data": {"call_id": "", "content": [{"text": "# User provided a file accessible to you at \\"/tmp/tmp7k7dg6qk/gcDtT5M8inflation.csv\\"\\nYou can use code_interpreter to load and inspect it.", "type": "text"}], "role": "tool", "tool_name": {"__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", "value": "code_interpreter"}}}]], {"response_format": null, "sa Test Plan: --- tests/integration/fixtures/recordable_mock.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/fixtures/recordable_mock.py b/tests/integration/fixtures/recordable_mock.py index d71426336..632d5b3ef 100644 --- a/tests/integration/fixtures/recordable_mock.py +++ 
b/tests/integration/fixtures/recordable_mock.py @@ -121,6 +121,9 @@ class RecordableMock: # Replace temporary file paths created by tempfile.mkdtemp() key = re.sub(r"/var/folders/[^,'\"\s]+", "", key) + # Replace /tmp/ paths which are also commonly used for temporary files + key = re.sub(r"/tmp/[^,'\"\s]+", "", key) + return key def _save_cache(self): From 3b4f3a6b15c1c1b21d9daa444385da764fce5487 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 13:58:38 -0800 Subject: [PATCH 26/58] test: update recorded fixtures (#1493) Summary: Test Plan: --- .../recorded_responses/chat_completion.json | 3563 ++++++++++++++--- .../recorded_responses/invoke_tool.json | 55 +- 2 files changed, 3058 insertions(+), 560 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index db45bbdf7..7234b6c31 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12500,27 +12500,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " degrees Fahrenheit.", + "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", "type": "text" }, "event_type": { @@ -12609,7 +12589,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"get_boiling", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -12629,7 +12609,7 @@ "data": { "event": { "delta": { - "text": "_point\", \"parameters\": {\"liquid_name", + "text": " \"get_boiling_point\", \"parameters\":", "type": "text" }, "event_type": { @@ -12649,7 +12629,7 @@ "data": { "event": { "delta": { - "text": "\": \"polyjuice\", \"cel", + "text": " {\"liquid_name\": \"polyjuice", "type": "text" }, "event_type": { @@ -12669,7 +12649,7 @@ "data": { "event": { "delta": { - "text": "cius\": \"false\"}}", + "text": "\", \"celcius\": \"false\"}}", "type": "text" }, "event_type": { @@ -12699,7 +12679,7 @@ "celcius": "false", "liquid_name": "polyjuice" }, - "call_id": "e8500d03-6e74-427c-b295-77bceca074f0", + "call_id": "bffe07d7-343f-49c4-bcff-d83c99fa7d4a", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12794,7 +12774,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n ", + "text": " \"type\": \"function\",\n \"name\": \"get", "type": "text" }, "event_type": { @@ -12814,7 +12794,7 @@ "data": { "event": { "delta": { - "text": " \"name\": \"get_boiling_point\",\n", + "text": "_boiling_point\",\n \"parameters\": {\n \"liquid_name", "type": "text" }, "event_type": { @@ -12834,7 +12814,7 @@ "data": { "event": { "delta": { - "text": " \"parameters\": {\n \"liquid", + "text": "\": \"polyjuice\",\n ", "type": "text" }, "event_type": { @@ -12854,27 +12834,7 @@ "data": { "event": { "delta": { - "text": "_name\": \"polyjuice\",\n \"celcius", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, 
- "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\": \"true\"\n }\n}", + "text": " \"celcius\": \"true\"\n }\n}", "type": "text" }, "event_type": { @@ -12904,7 +12864,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "ee7ca410-7953-407c-a479-09067389fa5c", + "call_id": "41ce6bfb-81c1-438d-8520-329c4446f1bc", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13187,7 +13147,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", "type": "tool_call" }, "event_type": { @@ -13212,7 +13172,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "name\": \"get_boiling_point\", \"parameters", + "tool_call": "\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"cel", "type": "tool_call" }, "event_type": { @@ -13237,32 +13197,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celcius\": \"true", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\"}}", + "tool_call": "cius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13292,7 +13227,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "f8adc867-71c3-472a-9f2b-95cd34c9f174", + "call_id": "6161b956-9b68-4e88-87bf-e26a07d4c7ca", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13397,7 +13332,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point_with", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -13422,7 +13357,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"", + "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid_name\": \"polyju", "type": "tool_call" }, "event_type": { @@ -13447,7 +13382,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "celcius\": \"true\"}}", + "tool_call": "ice\", \"celcius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13477,7 +13412,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "df18472c-42eb-4ded-8e84-e0b79159219a", + "call_id": "11da4a37-d7ad-468a-98c8-0f1e295d14a9", "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" @@ -14070,7 +14005,7 @@ "data": { "event": { "delta": { - "text": "'m unable to run the code as I'm missing the `b", + "text": "'m unable to access the file you provided", "type": "text" }, "event_type": { @@ -14090,7 +14025,7 @@ "data": { "event": { "delta": { - "text": "wrap.core` module. 
However, I can provide a general solution", + "text": ". However, I", "type": "text" }, "event_type": { @@ -14110,7 +14045,7 @@ "data": { "event": { "delta": { - "text": " for you.\n\nTo describe a CSV", + "text": " can suggest how you can describe the CSV file using the pandas library in Python.\n\nYou can use the `head()`, `dtypes`, and `describe()` functions to get an overview of the CSV file", "type": "text" }, "event_type": { @@ -14130,7 +14065,7 @@ "data": { "event": { "delta": { - "text": " file, you can use the `pandas` library in Python.", + "text": ".\n\n- `head()`: This function prints the first few rows of the", "type": "text" }, "event_type": { @@ -14150,7 +14085,7 @@ "data": { "event": { "delta": { - "text": " Here's a general solution:\n\n1.", + "text": " dataframe, giving you an idea of what the", "type": "text" }, "event_type": { @@ -14170,7 +14105,7 @@ "data": { "event": { "delta": { - "text": " Import the `pandas` library.\n2. Load the", + "text": " data looks like.\n- `dtypes`: This", "type": "text" }, "event_type": { @@ -14190,7 +14125,7 @@ "data": { "event": { "delta": { - "text": " CSV file using `pd.read_csv()`.\n", + "text": " function prints the data types of each column in the", "type": "text" }, "event_type": { @@ -14210,7 +14145,7 @@ "data": { "event": { "delta": { - "text": "3. Print the first few rows of the dataframe using `df", + "text": " dataframe.\n- `describe()`: This function prints summary", "type": "text" }, "event_type": { @@ -14230,7 +14165,7 @@ "data": { "event": { "delta": { - "text": ".head()`.\n4. Print the data types of each", + "text": " statistics of the dataframe, including mean, standard deviation, minimum, maximum,", "type": "text" }, "event_type": { @@ -14250,7 +14185,7 @@ "data": { "event": { "delta": { - "text": " column using `df.dtypes`.\n5. Print the summary", + "text": " and quartiles for numeric columns, and count and unique values for", "type": "text" }, "event_type": { @@ -14270,7 +14205,7 @@ "data": { "event": { "delta": { - "text": " statistics of the dataframe using `df.describe()`.\n\nThis will give", + "text": " object columns.\n\nIf you want to get more information about the CSV file,", "type": "text" }, "event_type": { @@ -14290,7 +14225,7 @@ "data": { "event": { "delta": { - "text": " you a general idea of what the CSV file contains. 
If you", + "text": " you can use the `info()` function, which prints a concise summary", "type": "text" }, "event_type": { @@ -14310,7 +14245,7 @@ "data": { "event": { "delta": { - "text": " need more specific information, please let me know and I'll be", + "text": " of the dataframe, including the index dtype and column dtypes, non-", "type": "text" }, "event_type": { @@ -14330,7 +14265,27 @@ "data": { "event": { "delta": { - "text": " happy to help.", + "text": "nullable values, and memory usage.\n\nPlease make sure the file is in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct format and is accessible to the Python script.", "type": "text" }, "event_type": { @@ -14365,59 +14320,356 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "uKno8S5o", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:19.978994+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 355 + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", 
\"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(code_interpreter.get_file_path(\\\"\"))\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "uKno8S5o", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:19.979047+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 166 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "uKno8S5o", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:19.979054+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 521 + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the `bwrap.core` module is not found", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". This is likely because the `bwrap` library is not installed", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
To fix this, you can install the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " `bwrap` library using pip:\n\n```\npip install", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " bwrap\n```\n\nIf you are still facing issues", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you can try to use the `code_interpreter.get_file_path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "()` function to load the CSV file directly, as shown in the corrected", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code above.\n\nAlternatively, if you don't have access to the `code", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_interpreter` library, you can use the `pandas.read_csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "()` function with the file path as a string:\n\n```\ndf = pd", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + 
"metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".read_csv(\"/var/folders/cz/vyh7y1d11", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "xg881lsxsshnc5c0000gn/T/tmp4ed", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "7p2bg/Csr659svinflation.csv\")\n```\n\nThis", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " should load the CSV file and allow you to inspect its contents.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" } - ] + }, + "metrics": null } } ], @@ -14481,7 +14733,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file", "type": "tool_call" }, "event_type": { @@ -14506,7 +14758,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/c", + "tool_call": "\ndf = pd.read_csv(code_interpreter.get_file_path(\"/var", "type": "tool_call" }, "event_type": { @@ -14531,7 +14783,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "z/vyh7y1d11xg881lsxssh", + "tool_call": "/folders/cz/vyh7y1", "type": "tool_call" }, "event_type": { @@ -14556,7 +14808,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "nc5c0000gn/T/tmplr_wf0lb", + "tool_call": "d11xg881lsxsshnc5c0000gn/T", "type": "tool_call" }, "event_type": { @@ -14581,7 +14833,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": 
"/Pl4Pewubinflation.csv\")\n\n# Print the first few", + "tool_call": "/tmp4ed7p2bg/Csr659svinflation.csv\"))\n\n", "type": "tool_call" }, "event_type": { @@ -14606,7 +14858,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " rows of the dataframe\nprint(df.head())\n\n# Print the data types of", + "tool_call": "# Print the first few rows of the dataframe\nprint(df.head", "type": "tool_call" }, "event_type": { @@ -14631,7 +14883,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " each column\nprint(df.dtypes)\n\n# Print the summary statistics of the", + "tool_call": "())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n", "type": "tool_call" }, "event_type": { @@ -14656,7 +14908,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " dataframe\nprint(df.describe())", + "tool_call": "# Print the summary statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -14683,9 +14935,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmplr_wf0lb/Pl4Pewubinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(code_interpreter.get_file_path(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\"))\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "40ed30d4-05c7-4a7f-93b0-e1e6e43e48de", + "call_id": "c5d0fce3-d7c6-4da1-89e4-e727df42f356", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -14730,59 +14982,7 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "sz886Glf", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:18.831808+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 196 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "sz886Glf", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:18.831870+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "sz886Glf", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:18.831879+00:00", - "__module__": "datetime" - }, - "trace_id": "qchwuhR3TlCRLUu5", - "type": "metric", - "unit": "tokens", - "value": 206 - } - ] + "metrics": null } } ], @@ -14846,7 +15046,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/f", + 
"tool_call": "import pandas as pd\n", "type": "tool_call" }, "event_type": { @@ -14871,7 +15071,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881", + "tool_call": "import code_interpreter\n\n# Load the CSV file\ndf =", "type": "tool_call" }, "event_type": { @@ -14896,7 +15096,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "lsxsshnc5c0000gn/T/tmpeip", + "tool_call": " pd.read_csv(\"/var/folders", "type": "tool_call" }, "event_type": { @@ -14921,7 +15121,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "ex0j0/b807hgTQinflation.csv\")\n", + "tool_call": "/cz/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -14946,7 +15146,132 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "print(df.head())", + "tool_call": "lsxsshnc5c0000gn/T/tmp4ed7", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "p2bg/Csr659svinflation.csv\")\n\n# Print", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the first few rows of the dataframe\nprint(df.head())\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the data types of each column\nprint(df.dtypes)\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print the summary statistics of the dataframe", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", 
+ "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -14973,9 +15298,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpeipex0j0/b807hgTQinflation.csv\")\nprint(df.head())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "d431c3a2-5b91-4407-8323-27bc134503e0", + "call_id": "8aeab20b-341b-4349-84dc-3e3c3299d713", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -15748,6 +16073,638 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. 
\\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, 
\"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over time. The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " x-axis represents the year and the y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-axis represents the average inflation. Each point on the plot represents", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the average inflation for a particular year.\n\nPlease note that you need", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to replace 'inflation.csv' with the actual path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to your csv file. 
Also, this code assumes that the csv file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " has a column named 'date' and another column named 'inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'. If your csv file has different column names", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you need to replace 'date' and 'inflation'", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " with the actual column names.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the 
data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": 
{ + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df = pd.read_csv('inflation.csv')\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Convert 'date' column to datetime\ndf['date']", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd.to_datetime(df['date'])\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": 
"llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Group by year and calculate average inflation\naverage_inflation = df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".groupby(df['date'].dt.year)['inflation'].mean()\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Plot the time series\nplt.figure(figsize=(10,6))\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".plot(average_inflation.index, average_inflation.values, marker='o", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + 
"__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "91ad7e4c-2e89-4cb5-9d0b-753ceafb7eab", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module 
named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": 
{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -16702,7 +17659,7 @@ "data": { "event": { "delta": { - "text": " seems that the file \"/var/folders/cz/vyh7y1", + "text": " seems that the file \"/var/folders", "type": "text" }, "event_type": { @@ -16722,7 +17679,7 @@ "data": { "event": { "delta": { - "text": "d11xg881lsxsshnc5c0000gn/T/t", + "text": "/cz/vyh7y1d11xg881", "type": "text" }, "event_type": { @@ -16742,7 +17699,7 @@ "data": { "event": { "delta": { - "text": "mpr3640a7b/Y5UaJew2inflation", + "text": "lsxsshnc5c0000gn/T/tmp4ed7", "type": "text" }, "event_type": { @@ -16762,7 +17719,7 @@ "data": { "event": { "delta": { - "text": ".csv\" does not exist. \n\nTo describe the csv file, you need", + "text": "p2bg/UZ0Z335vinflation.csv\" does", "type": "text" }, "event_type": { @@ -16782,7 +17739,7 @@ "data": { "event": { "delta": { - "text": " to provide the actual file path or the file itself. If the file is", + "text": " not exist. \n\nTo describe the csv file, you need to", "type": "text" }, "event_type": { @@ -16802,7 +17759,7 @@ "data": { "event": { "delta": { - "text": " in your current directory, you can use the following code:\n\n```python\n", + "text": " provide the actual file path or the file itself. If the file", "type": "text" }, "event_type": { @@ -16822,7 +17779,7 @@ "data": { "event": { "delta": { - "text": "import pandas as pd\n# Load data\n", + "text": " is too large to be uploaded, you can provide a sample", "type": "text" }, "event_type": { @@ -16842,7 +17799,7 @@ "data": { "event": { "delta": { - "text": "df = pd.read_csv('inflation.csv')\n# Print", + "text": " of the csv file and I can help you describe it. 
\n\nHere is", "type": "text" }, "event_type": { @@ -16862,7 +17819,7 @@ "data": { "event": { "delta": { - "text": " the first 5 rows of the dataframe\nprint(df.head())\n# Print the", + "text": " an example of how you can describe a", "type": "text" }, "event_type": { @@ -16882,7 +17839,7 @@ "data": { "event": { "delta": { - "text": " summary of the dataframe\nprint(df.info())\nprint(df.describe())\n```\n\n", + "text": " csv file using pandas:\n\n```\nimport pandas as pd\n#", "type": "text" }, "event_type": { @@ -16902,7 +17859,7 @@ "data": { "event": { "delta": { - "text": "This will print the first 5 rows of the dataframe, the summary of", + "text": " Load data\ndf = pd.read_csv('", "type": "text" }, "event_type": { @@ -16922,7 +17879,7 @@ "data": { "event": { "delta": { - "text": " the dataframe (including the index dtype and column count), and the description of", + "text": "inflation.csv')\n# Print the first 5 rows of the", "type": "text" }, "event_type": { @@ -16942,7 +17899,7 @@ "data": { "event": { "delta": { - "text": " the dataframe (including count, mean, std, min, 25%,", + "text": " data\nprint(df.head())\n# Print the last 5 rows of the", "type": "text" }, "event_type": { @@ -16962,7 +17919,87 @@ "data": { "event": { "delta": { - "text": " 50%, 75%, max for each column).", + "text": " data\nprint(df.tail())\n# Print the summary statistics of the data\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "print(df.describe())\n# Print the data types of each column\nprint(df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".dtypes)\n# Print the number of missing values in each column\nprint", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(df.isnull().sum())\n```\n\nThis will give you an idea of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " what the csv file contains.", "type": "text" }, "event_type": { @@ -17061,7 +18098,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/", + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", "type": 
"tool_call" }, "event_type": { @@ -17086,7 +18123,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "var/folders/cz/vyh7y1d11xg881", + "tool_call": "_csv(\"/var/folders/cz/vyh7y1d", "type": "tool_call" }, "event_type": { @@ -17111,7 +18148,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "lsxsshnc5c0000gn", + "tool_call": "11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -17136,7 +18173,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/T/tmpr3640a7b", + "tool_call": "5c0000gn/T/tmp4ed7p2bg/U", "type": "tool_call" }, "event_type": { @@ -17161,7 +18198,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/Y5UaJew2", + "tool_call": "Z0Z335vinflation.csv\")\n# Rows\nprint(\"", "type": "tool_call" }, "event_type": { @@ -17186,7 +18223,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "inflation.csv\")\n# Rows\nprint(\"", + "tool_call": "Number of rows and columns in the data:\", df.shape)\n# Columns", "type": "tool_call" }, "event_type": { @@ -17211,7 +18248,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Number of rows and columns in the", + "tool_call": "\nprint(\"Columns of the data are:\", len(df.columns))\n# Column", "type": "tool_call" }, "event_type": { @@ -17236,7 +18273,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " data:\", df.shape)\n# Columns\nprint", + "tool_call": " names\nprint(\"Columns of the data are:\", df.columns)\n# Column", "type": "tool_call" }, "event_type": { @@ -17261,7 +18298,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(\"Columns of the data are:\", len", + "tool_call": " dtypes\nprint(\"Datatype of the columns are:\", df.dtypes", "type": "tool_call" }, "event_type": { @@ -17286,82 +18323,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(df.columns))\n# Column names\nprint(\"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Columns of the data are:\", df.columns)\n# Column dtypes\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(\"Datatype of the columns are:\",", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - 
"__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " df.dtypes)", + "tool_call": ")", "type": "tool_call" }, "event_type": { @@ -17388,9 +18350,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpr3640a7b/Y5UaJew2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/UZ0Z335vinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "c18dbae3-9ce0-4914-8062-20a3987959e4", + "call_id": "98e27ff4-d4d7-4764-9213-f46bb928ec68", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -17441,6 +18403,828 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:961ff\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " {\"query\": \"How to use LoRA in Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "0d852474-6781-48ed-b8c1-778bd0f4e7f0", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. 
code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided. 
What's your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d4e29\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:46132\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "45ec3014-ff3f-4d0b-9649-30a299f7b9d4", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " questions about Torchtune based on the documentation you provided.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": 
null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:1b69d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\\\", \\\"url\\\": \\\"https://observer.com/2024/01/meta-facebook-top-executives/\\\", \\\"content\\\": \\\"Meta has one of the most stable leadership team in the tech industry. 
Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\\\", \\\"score\\\": 0.45536873, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company.\\\", \\\"score\\\": 0.21026355, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + 
"metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -21703,7 +24218,7 @@ "arguments": { "query": "current CEO of Meta" }, - "call_id": "8e303404-99c1-4610-9e53-82440614bf51", + "call_id": "cc85a2df-6b2d-41c0-97dd-1509ca8061c4", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -21802,7 +24317,7 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point` is not able to find the boiling point", + "text": " 
function `get_boiling_point` is not able to", "type": "text" }, "event_type": { @@ -21822,7 +24337,7 @@ "data": { "event": { "delta": { - "text": " of polyjuice as it is a fictional liquid from the Harry Potter series", + "text": " find the boiling point of polyjuice as", "type": "text" }, "event_type": { @@ -21842,7 +24357,27 @@ "data": { "event": { "delta": { - "text": ". The function is only able to find the boiling point of real liquids.", + "text": " it is a fictional liquid from the Harry Potter series. The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function is only able to find the boiling point of real liquids.", "type": "text" }, "event_type": { @@ -22060,7 +24595,7 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point` is not", + "text": " function `get_boiling_point` is not able to find the", "type": "text" }, "event_type": { @@ -22080,7 +24615,7 @@ "data": { "event": { "delta": { - "text": " able to find the boiling point of polyjuice as it is", + "text": " boiling point of polyjuice as it is not a real liquid", "type": "text" }, "event_type": { @@ -22100,7 +24635,7 @@ "data": { "event": { "delta": { - "text": " not a real liquid.", + "text": ".", "type": "text" }, "event_type": { @@ -22199,7 +24734,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -22224,7 +24759,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"poly", "type": "tool_call" }, "event_type": { @@ -22249,7 +24784,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\"}}", + "tool_call": "juice\"}}", "type": "tool_call" }, "event_type": { @@ -22278,7 +24813,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "3d4300a8-2093-458d-8195-3530acaea9e6", + "call_id": "83d9f330-4c7a-4dd3-8fcb-ccc5301c1f83", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -22383,7 +24918,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "tool_call": "{\"type\": \"function\", \"name\":", "type": "tool_call" }, "event_type": { @@ -22408,7 +24943,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + 
"__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -22437,7 +24997,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "da92286f-5b46-45e6-a2ae-a224279323c7", + "call_id": "98c63572-06c8-4cc0-a14e-3b10fb9ddc19", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -22532,7 +25092,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of Polyjuice. Polyju", + "text": " couldn't find any information on the boiling point of Polyjuice", "type": "text" }, "event_type": { @@ -22552,7 +25112,7 @@ "data": { "event": { "delta": { - "text": "ice is a magical potion in the Harry Potter series that allows the drinker to", + "text": ". Polyjuice is a magical potion in the Harry Potter series", "type": "text" }, "event_type": { @@ -22572,7 +25132,7 @@ "data": { "event": { "delta": { - "text": " transform into someone else. It's not a physical substance with a boiling point.", + "text": " that allows the drinker to transform into someone else. It's", "type": "text" }, "event_type": { @@ -22592,7 +25152,7 @@ "data": { "event": { "delta": { - "text": " If you have any other questions, I'd", + "text": " not a physical substance with a boiling point. If you have any", "type": "text" }, "event_type": { @@ -22612,7 +25172,7 @@ "data": { "event": { "delta": { - "text": " be happy to help.", + "text": " other questions, I'd be happy to help.", "type": "text" }, "event_type": { @@ -22711,7 +25271,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "tool_call": "{\"type\": \"function\", \"name\":", "type": "tool_call" }, "event_type": { @@ -22736,7 +25296,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"", + "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", "type": "tool_call" }, "event_type": { @@ -22761,7 +25321,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "polyjuice\"}}", + "tool_call": " \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -22790,7 +25350,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "afbebcb6-ec6b-4e08-99d5-4f92dc68d840", + "call_id": "cdccc866-97a0-40fd-b6e2-a0555f0ed921", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -22984,7 +25544,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False\n", + "tool_call": "def is_prime(n):\n if n <= 1:\n ", "type": "tool_call" }, "event_type": { @@ -23009,7 +25569,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n <= 3:\n return True\n if n % ", + "tool_call": " return False\n if n <= 3:\n return True", "type": "tool_call" }, "event_type": { @@ -23034,7 +25594,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2 == 0 or n % 3 ==", + "tool_call": "\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i +", "type": "tool_call" }, "event_type": { @@ -23059,7 +25619,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0:\n return False\n i = 5\n ", + 
"tool_call": " 2) == 0:\n return False\n ", "type": "tool_call" }, "event_type": { @@ -23084,7 +25644,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " while i * i <= n:\n if", + "tool_call": " i += 6\n return True\n\ndef get_nth_prime(n):\n count", "type": "tool_call" }, "event_type": { @@ -23109,7 +25669,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " n % i == 0 or n % (i + ", + "tool_call": " = 0\n num = 2\n while True:\n if", "type": "tool_call" }, "event_type": { @@ -23134,7 +25694,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2) == 0:\n return False\n i", + "tool_call": " is_prime(num):\n count += 1", "type": "tool_call" }, "event_type": { @@ -23159,7 +25719,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " += 6\n return True\n\ndef get_nth_prime(n):\n count =", + "tool_call": "\n if count == n:\n return num\n num += ", "type": "tool_call" }, "event_type": { @@ -23184,82 +25744,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0\n num = 2\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " while True:\n if is_prime(num):\n count +=", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 1\n if count == n:\n return num\n num +=", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 1\n\nprint(get_nth_prime(100))", + "tool_call": "1\n\nprint(get_nth_prime(100))", "type": "tool_call" }, "event_type": { @@ -23288,7 +25773,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 
1\n\nprint(get_nth_prime(100))" }, - "call_id": "1d9ced32-c0fa-467b-9299-a4f38cf06926", + "call_id": "7fca0515-82f3-46e1-bbec-eceb8fa5162e", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -23387,7 +25872,27 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022.", + "text": "plexity the company was founded in 2022", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -23476,7 +25981,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "text": "type\": \"function\", \"name\": \"knowledge_search\",", "type": "text" }, "event_type": { @@ -23496,7 +26001,27 @@ "data": { "event": { "delta": { - "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", + "text": " \"parameters\": {\"query\": \"Perplexity company founding", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " date\"}}", "type": "text" }, "event_type": { @@ -23525,7 +26050,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "393a2b30-fbe9-44c3-b2b8-4ecdb086785f", + "call_id": "ca248109-25af-4737-90cb-6461faaf4e63", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23630,7 +26155,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "tool_call" }, "event_type": { @@ -23655,7 +26180,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_search\", \"parameters\": {\"query\": \"Perplexity", + "tool_call": "\": {\"query\": \"Perplexity", "type": "tool_call" }, "event_type": { @@ -23709,7 +26234,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "84505681-7471-4e1d-8779-916703da7dbb", + "call_id": "94a9fd55-7658-482d-8595-d2c2a23b3a1e", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23933,7 +26458,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\":", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "text" }, "event_type": { @@ -23953,47 +26478,7 @@ "data": { "event": { "delta": { - "text": " \"knowledge_search\", \"parameters\":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " {\"query\": \"when was the", - 
"type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " nba created\"}}", + "text": "\": {\"query\": \"when was the nba created\"}}", "type": "text" }, "event_type": { @@ -24022,7 +26507,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "e8ac462f-e6e7-4ee8-8d18-09e330454890", + "call_id": "7b01a40d-a6a8-4c86-b91d-1790e7480e57", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -24127,7 +26612,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -24152,7 +26637,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": \"knowledge_search\", \"parameters\": {\"query\": \"when", + "tool_call": " \"parameters\": {\"query\": \"when was the nba created", "type": "tool_call" }, "event_type": { @@ -24177,7 +26662,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " was the nba created\"}}", + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -24206,7 +26691,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "db2abfd7-9fe5-4957-b2b4-84b1f120092b", + "call_id": "bbaf750a-0337-4c83-9bf2-76c2f72d45c3", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 3e6b6a307..76191e992 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -90,6 +90,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(code_interpreter.get_file_path(\\\"\"))\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", 
\"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -141,23 +154,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:5c435\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:961ff\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:5c435\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:5c435\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:b49f7\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:5c435\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:91d52\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. 
_glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -389,11 +402,11 @@ "error_message": null, "metadata": { "document_ids": [ - "ea3f6e4d-9e11-4bd0-8322-6371f7b0de0c", - "5c435311-5dba-4b40-b8c9-9fd37fbd9b29", - "91d525eb-07dc-4cad-8596-dd0e6bd011f1", - "5c435311-5dba-4b40-b8c9-9fd37fbd9b29", - "91d525eb-07dc-4cad-8596-dd0e6bd011f1" + "24443dfb-a0b3-4ce8-820e-3fb1f12364bb", + "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", + "b49f7985-6615-4dcf-99be-d1765b6a6fc6", + "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", + "b49f7985-6615-4dcf-99be-d1765b6a6fc6" ] } } @@ -405,7 +418,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. 
Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Forbes\", \"url\": \"https://www.forbes.com/profile/mark-zuckerberg/\", \"content\": \"Meta has donated $1 million to President-elect Donald Trump's inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark\", \"score\": 0.6701125, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"META | Meta Platforms Inc. Company Profile & Executives - WSJ\", \"url\": \"https://www.wsj.com/market-data/quotes/META/company-people\", \"content\": \"Company profile for Meta Platforms Inc. including key executives, insider trading, ownership, revenue and average growth rates. View detailed META description & address.\", \"score\": 0.23361932, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. 
He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null From 9028407386feae49c8d878f7661efd362981d1a9 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Fri, 7 Mar 2025 14:03:54 -0800 Subject: [PATCH 27/58] fix: clean up detailed history for CHANGELOG (#1494) # What does this PR do? 
- do not dump all commit history in CHANGELOG cc @terrytangyuan ## Test Plan ``` python scripts/gen-changelog.py ``` --- CHANGELOG.md | 1378 +++++--------------------------------- scripts/gen-changelog.py | 51 +- 2 files changed, 221 insertions(+), 1208 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a9911915..2e544e93f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,9 @@ # v0.1.5.1 Published on: 2025-02-28T22:37:44Z -## What's Changed -* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328 - +## 0.1.5.1 Release Notes +* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328 + **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1 --- @@ -13,839 +13,191 @@ Published on: 2025-02-28T22:37:44Z # v0.1.5 Published on: 2025-02-28T18:14:01Z -## 0.1.5 Release Notes -### Build Agents -* Inference: Support more non-llama models (openai, anthropic, gemini) -* Inference: Can use the provider's model name in addition to the HF alias -* Inference: Fixed issues with calling tools that weren't specified in the prompt -* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling -* Embeddings: Added support for Nemo retriever embedding models -* Tools: Added support for MCP tools in Ollama Distribution -* Distributions: Added new Groq distribution - -### Customize Models -* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model - -### Monitor agents -* More comprehensive logging of agent steps including client tools -* Telemetry inputs/outputs are now structured and queryable -* Ability to retrieve agents session, turn, step by ids - -### Better Engineering -* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin -* Move most logging to use logger instead of prints -* Completed text /chat-completion and /completion tests - -## All changes -* test: add a ci-tests distro template for running e2e tests by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1237 -* refactor: combine start scripts for each env by @cdoern in https://github.com/meta-llama/llama-stack/pull/1139 -* fix: pre-commit updates by @cdoern in https://github.com/meta-llama/llama-stack/pull/1243 -* fix: Update getting_started.ipynb by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1245 -* fix: Update Llama_Stack_Benchmark_Evals.ipynb by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1246 -* build: hint on Python version for uv venv by @leseb in https://github.com/meta-llama/llama-stack/pull/1172 -* fix: include timezone in Agent steps' timestamps by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1247 -* LocalInferenceImpl update for LS013 by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/1242 -* fix: Raise exception when tool call result is None by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1253 -* fix: resolve type hint issues and import dependencies by @leseb in https://github.com/meta-llama/llama-stack/pull/1176 -* fix: build_venv expects an extra argument by @cdoern in https://github.com/meta-llama/llama-stack/pull/1233 -* feat: completing text /chat-completion and /completion
tests by @LESSuseLESS in https://github.com/meta-llama/llama-stack/pull/1223 -* fix: update index.md to include 0.1.4 by @raghotham in https://github.com/meta-llama/llama-stack/pull/1259 -* docs: Remove $ from client CLI ref to add valid copy and paste ability by @kelbrown20 in https://github.com/meta-llama/llama-stack/pull/1260 -* feat: Add Groq distribution template by @VladOS95-cyber in https://github.com/meta-llama/llama-stack/pull/1173 -* chore: update the zero_to_hero_guide doc link by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1220 -* build: Merge redundant "files" field for codegen check in .pre-commit-config.yaml by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1261 -* refactor(server): replace print statements with logger by @leseb in https://github.com/meta-llama/llama-stack/pull/1250 -* fix: fix the describe table display issue by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1221 -* chore: update download error message by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1217 -* chore: removed executorch submodule by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/1265 -* refactor: move OpenAI compat utilities from nvidia to openai_compat by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1258 -* feat: add (openai, anthropic, gemini) providers via litellm by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1267 -* feat: [post training] support save hf safetensor format checkpoint by @SLR722 in https://github.com/meta-llama/llama-stack/pull/845 -* fix: the pre-commit new line issue by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1272 -* fix(cli): Missing default for --image-type in stack run command by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1274 -* fix: Get builtin tool calling working in remote-vllm by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1236 -* feat: remove special handling of builtin::rag tool by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1015 -* feat: update the post training notebook by @SLR722 in https://github.com/meta-llama/llama-stack/pull/1280 -* fix: time logging format by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1281 -* feat: allow specifying specific tool within toolgroup by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1239 -* fix: sqlite conn by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1282 -* chore: upgrade uv pre-commit version, uv-sync -> uv-lock by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1284 -* fix: don't attempt to clean gpu memory up when device is cpu by @booxter in https://github.com/meta-llama/llama-stack/pull/1191 -* feat: Add model context protocol tools with ollama provider by @Shreyanand in https://github.com/meta-llama/llama-stack/pull/1283 -* fix(test): update client-sdk tests to handle tool format parametrization better by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1287 -* feat: add nemo retriever text embedding models to nvidia inference provider by @mattf in https://github.com/meta-llama/llama-stack/pull/1218 -* feat: don't silently ignore incorrect toolgroup by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1285 -* feat: ability to retrieve agents session, turn, step by ids by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1286 -* fix(test): no need to specify tool prompt format explicitly in tests by @ashwinb in 
https://github.com/meta-llama/llama-stack/pull/1295 -* chore: remove vector_db_id from AgentSessionInfo by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1296 -* fix: Revert "chore: remove vector_db_id from AgentSessionInfo" by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1299 -* feat(providers): Groq now uses LiteLLM openai-compat by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1303 -* fix: duplicate ToolResponseMessage in Turn message history by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1305 -* fix: don't include tool args not in the function definition by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1307 -* fix: update notebooks to avoid using the nutsy --image-name __system__ thing by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1308 -* fix: register provider model name and HF alias in run.yaml by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1304 -* build: Add dotenv file for running tests with uv by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1251 -* docs: update the output of llama-stack-client models list by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1271 -* fix: Avoid unexpected keyword argument for sentence_transformers by @luis5tb in https://github.com/meta-llama/llama-stack/pull/1269 -* feat: add nvidia embedding implementation for new signature, task_type, output_dimention, text_truncation by @mattf in https://github.com/meta-llama/llama-stack/pull/1213 -* chore: add subcommands description in help by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1219 -* fix: Structured outputs for recursive models by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1311 -* fix: litellm tool call parsing event type to in_progress by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1312 -* fix: Incorrect import path for print_subcommand_description() by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1313 -* fix: Incorrect import path for print_subcommand_description() by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1314 -* fix: Incorrect import path for print_subcommand_description() by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1315 -* test: Only run embedding tests for remote::nvidia by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1317 -* fix: update getting_started notebook to pass nbeval by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1318 -* fix: [Litellm]Do not swallow first token by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1316 -* feat: update the default system prompt for 3.2/3.3 models by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1310 -* fix: Agent telemetry inputs/outputs should be structured by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1302 -* fix: check conda env name using basepath in exec.py by @dineshyv in https://github.com/meta-llama/llama-stack/pull/1301 - -## New Contributors -* @Shreyanand made their first contribution in https://github.com/meta-llama/llama-stack/pull/1283 -* @luis5tb made their first contribution in https://github.com/meta-llama/llama-stack/pull/1269 - -**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.4...v0.1.5 +## 0.1.5 Release Notes +### Build Agents +* Inference: Support more non-llama models (openai, anthropic, gemini) +* Inference: Can use the provider's model name in addition to the HF alias 
+* Inference: Fixed issues with calling tools that weren't specified in the prompt +* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling +* Embeddings: Added support for Nemo retriever embedding models +* Tools: Added support for MCP tools in Ollama Distribution +* Distributions: Added new Groq distribution + +### Customize Models +* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model + +### Monitor agents +* More comprehensive logging of agent steps including client tools +* Telemetry inputs/outputs are now structured and queryable +* Ability to retrieve agents session, turn, step by ids + +### Better Engineering +* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin +* Move most logging to use logger instead of prints +* Completed text /chat-completion and /completion tests + --- # v0.1.4 Published on: 2025-02-25T00:02:43Z -## v0.1.4 Release Notes -Here are the key changes coming as part of this release: - -### Build and Test Agents -* Inference: Added support for non-llama models -* Inference: Added option to list all downloaded models and remove models -* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn -* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides -* Agent: Added logging for agent step start and completion times -* Agent: Added support for logging for tool execution metadata -* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs -* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults -* VectorIO: Improved performance of sqlite-vec using chunked writes -### Agent Evals and Model Customization -* Deprecated api /eval-tasks. 
Use /eval/benchmark instead -* Added CPU training support for TorchTune -### Deploy and Monitoring of Agents -* Consistent view of client and server tool calls in telemetry -### Better Engineering -* Made tests more data-driven for consistent evaluation -* Fixed documentation links and improved API reference generation -* Various small fixes for build scripts and system reliability - - -## What's Changed -* build: resync uv and deps on 0.1.3 by @leseb in https://github.com/meta-llama/llama-stack/pull/1108 -* style: fix the capitalization issue by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1117 -* feat: log start, complete time to Agent steps by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1116 -* fix: Ensure a tool call can be converted before adding to buffer by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1119 -* docs: Fix incorrect link and command for generating API reference by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1124 -* chore: remove --no-list-templates option by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1121 -* style: update verify-download help text by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1134 -* style: update download help text by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1135 -* fix: modify the model id title for model list by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1095 -* fix: direct client pydantic type casting by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1145 -* style: remove prints in codebase by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1146 -* feat: support tool_choice = {required, none, } by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1059 -* test: Enable test_text_chat_completion_with_tool_choice_required for remote::vllm by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1148 -* fix(rag-example): add provider_id to avoid llama_stack_client 400 error by @fulvius31 in https://github.com/meta-llama/llama-stack/pull/1114 -* fix: Get distro_codegen.py working with default deps and enabled in pre-commit hooks by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1123 -* chore: remove llama_models.llama3.api imports from providers by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1107 -* docs: fix Python llama_stack_client SDK links by @leseb in https://github.com/meta-llama/llama-stack/pull/1150 -* feat: Chunk sqlite-vec writes by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/1094 -* fix: miscellaneous job management improvements in torchtune by @booxter in https://github.com/meta-llama/llama-stack/pull/1136 -* feat: add aggregation_functions to llm_as_judge_405b_simpleqa by @SLR722 in https://github.com/meta-llama/llama-stack/pull/1164 -* feat: inference passthrough provider by @SLR722 in https://github.com/meta-llama/llama-stack/pull/1166 -* docs: Remove unused python-openapi and json-strong-typing in openapi_generator by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1167 -* docs: improve API contribution guidelines by @leseb in https://github.com/meta-llama/llama-stack/pull/1137 -* feat: add a option to list the downloaded models by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1127 -* fix: Fixing some small issues with the build scripts by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/1132 -* fix: llama stack build use UV_SYSTEM_PYTHON 
to install dependencies to system environment by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1163 -* build: add missing dev dependencies for unit tests by @leseb in https://github.com/meta-llama/llama-stack/pull/1004 -* fix: More robust handling of the arguments in tool call response in remote::vllm by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1169 -* Added support for mongoDB KV store by @shrinitg in https://github.com/meta-llama/llama-stack/pull/543 -* script for running client sdk tests by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/895 -* test: skip model registration for unsupported providers by @leseb in https://github.com/meta-llama/llama-stack/pull/1030 -* feat: Enable CPU training for torchtune by @booxter in https://github.com/meta-llama/llama-stack/pull/1140 -* fix: add logging import by @raspawar in https://github.com/meta-llama/llama-stack/pull/1174 -* docs: Add note about distro_codegen.py and provider dependencies by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1175 -* chore: slight renaming of model alias stuff by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1181 -* feat: adding endpoints for files and uploads by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/1070 -* docs: Fix Links, Add Podman Instructions, Vector DB Unregister, and Example Script by @kevincogan in https://github.com/meta-llama/llama-stack/pull/1129 -* chore!: deprecate eval/tasks by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1186 -* fix: some telemetry APIs don't currently work by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1188 -* feat: D69478008 [llama-stack] turning tests into data-driven by @LESSuseLESS in https://github.com/meta-llama/llama-stack/pull/1180 -* feat: register embedding models for ollama, together, fireworks by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1190 -* feat(providers): add NVIDIA Inference embedding provider and tests by @mattf in https://github.com/meta-llama/llama-stack/pull/935 -* docs: Add missing uv command for docs generation in contributing guide by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1197 -* docs: Simplify installation guide with `uv` by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1196 -* fix: BuiltinTool JSON serialization in remote vLLM provider by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1183 -* ci: improve GitHub Actions workflow for website builds by @leseb in https://github.com/meta-llama/llama-stack/pull/1151 -* fix: pass tool_prompt_format to chat_formatter by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1198 -* fix(api): update embeddings signature so inputs and outputs list align by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1161 -* feat(api): Add options for supporting various embedding models by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1192 -* fix: update URL import, URL -> ImageContentItemImageURL by @mattf in https://github.com/meta-llama/llama-stack/pull/1204 -* feat: model remove cmd by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1128 -* chore: remove configure subcommand by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1202 -* fix: remove list of list tests, no longer relevant after #1161 by @mattf in https://github.com/meta-llama/llama-stack/pull/1205 -* test(client-sdk): Update embedding test types to use latest imports by @raspawar in 
https://github.com/meta-llama/llama-stack/pull/1203 -* fix: convert back to model descriptor for model in list --downloaded by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1201 -* docs: Add missing uv command and clarify website rebuild by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1199 -* fix: Updating images so that they are able to run without root access by @jland-redhat in https://github.com/meta-llama/llama-stack/pull/1208 -* fix: pull ollama embedding model if necessary by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1209 -* chore: move embedding deps to RAG tool where they are needed by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1210 -* feat(1/n): api: unify agents for handling server & client tools by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1178 -* feat: tool outputs metadata by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1155 -* ci: add mypy for static type checking by @leseb in https://github.com/meta-llama/llama-stack/pull/1101 -* feat(providers): support non-llama models for inference providers by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1200 -* test: fix test_rag_agent test by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1215 -* feat: add substring search for model list by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1099 -* test: do not overwrite agent_config by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1216 -* docs: Adding Provider sections to docs by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/1195 -* fix: update virtualenv building so llamastack- prefix is not added, make notebook experience easier by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1225 -* feat: add --run to llama stack build by @cdoern in https://github.com/meta-llama/llama-stack/pull/1156 -* docs: Add vLLM to the list of inference providers in concepts and providers pages by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1227 -* docs: small fixes by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1224 -* fix: avoid failure when no special pip deps and better exit by @leseb in https://github.com/meta-llama/llama-stack/pull/1228 -* fix: set default tool_prompt_format in inference api by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1214 -* test: fix test_tool_choice by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1234 - -## New Contributors -* @fulvius31 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1114 -* @shrinitg made their first contribution in https://github.com/meta-llama/llama-stack/pull/543 -* @raspawar made their first contribution in https://github.com/meta-llama/llama-stack/pull/1174 -* @kevincogan made their first contribution in https://github.com/meta-llama/llama-stack/pull/1129 -* @LESSuseLESS made their first contribution in https://github.com/meta-llama/llama-stack/pull/1180 -* @jland-redhat made their first contribution in https://github.com/meta-llama/llama-stack/pull/1208 - -**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.3...v0.1.4 +## v0.1.4 Release Notes +Here are the key changes coming as part of this release: + +### Build and Test Agents +* Inference: Added support for non-llama models +* Inference: Added option to list all downloaded models and remove models +* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn 
+* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides +* Agent: Added logging for agent step start and completion times +* Agent: Added support for logging for tool execution metadata +* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs +* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults +* VectorIO: Improved performance of sqlite-vec using chunked writes +### Agent Evals and Model Customization +* Deprecated api /eval-tasks. Use /eval/benchmark instead +* Added CPU training support for TorchTune +### Deploy and Monitoring of Agents +* Consistent view of client and server tool calls in telemetry +### Better Engineering +* Made tests more data-driven for consistent evaluation +* Fixed documentation links and improved API reference generation +* Various small fixes for build scripts and system reliability + + --- # v0.1.3 Published on: 2025-02-14T20:24:32Z -## v0.1.3 Release - -Here are some key changes that are coming as part of this release. - -### Build and Test Agents -Streamlined the initial development experience -- Added support for llama stack run --image-type venv -- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration -- vLLM improvements for tool calling and logprobs -- Better handling of sporadic code_interpreter tool calls - -### Agent Evals -Better benchmarking and Agent performance assessment -- Renamed eval API /eval-task to /benchmarks -- Improved documentation and notebooks for RAG and evals - -### Deploy and Monitoring of Agents -Improved production readiness -- Added usage metrics collection for chat completions -- CLI improvements for provider information -- Improved error handling and system reliability -- Better model endpoint handling and accessibility -- Improved signal handling on distro server - -### Better Engineering -Infrastructure and code quality improvements -- Faster text-based chat completion tests -- Improved testing for non-streaming agent apis -- Standardized import formatting with ruff linter -- Added conventional commits standard -- Fixed documentation parsing issues - -## What's Changed -* Getting started notebook update by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/936 -* docs: update index.md for 0.1.2 by @raghotham in https://github.com/meta-llama/llama-stack/pull/1013 -* test: Make text-based chat completion tests run 10x faster by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1016 -* chore: Updated requirements.txt by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/1017 -* test: Use JSON tool prompt format for remote::vllm provider by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1019 -* docs: Render check marks correctly on PyPI by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1024 -* docs: update rag.md example code to prevent errors by @MichaelClifford in https://github.com/meta-llama/llama-stack/pull/1009 -* build: update uv lock to sync package versions by @leseb in https://github.com/meta-llama/llama-stack/pull/1026 -* fix: Gaps in doc codegen by @ellistarn in https://github.com/meta-llama/llama-stack/pull/1035 -* fix: Readthedocs cannot parse comments, resulting in docs bugs by @ellistarn in https://github.com/meta-llama/llama-stack/pull/1033 -* fix: a bad newline in ollama docs by @ellistarn in 
https://github.com/meta-llama/llama-stack/pull/1036 -* fix: Update Qdrant support post-refactor by @jwm4 in https://github.com/meta-llama/llama-stack/pull/1022 -* test: replace blocked image URLs with GitHub-hosted by @leseb in https://github.com/meta-llama/llama-stack/pull/1025 -* fix: Added missing `tool_config` arg in SambaNova `chat_completion()` by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1042 -* docs: Updating wording and nits in the README.md by @kelbrown20 in https://github.com/meta-llama/llama-stack/pull/992 -* docs: remove changelog mention from PR template by @leseb in https://github.com/meta-llama/llama-stack/pull/1049 -* docs: reflect actual number of spaces for indent by @booxter in https://github.com/meta-llama/llama-stack/pull/1052 -* fix: agent config validation by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1053 -* feat: add MetricResponseMixin to chat completion response types by @dineshyv in https://github.com/meta-llama/llama-stack/pull/1050 -* feat: make telemetry attributes be dict[str,PrimitiveType] by @dineshyv in https://github.com/meta-llama/llama-stack/pull/1055 -* fix: filter out remote::sample providers when listing by @booxter in https://github.com/meta-llama/llama-stack/pull/1057 -* feat: Support tool calling for non-streaming chat completion in remote vLLM provider by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1034 -* perf: ensure ToolCall in ChatCompletionResponse is subset of ChatCompletionRequest.tools by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1041 -* chore: update return type to Optional[str] by @leseb in https://github.com/meta-llama/llama-stack/pull/982 -* feat: Support tool calling for streaming chat completion in remote vLLM provider by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1063 -* fix: show proper help text by @cdoern in https://github.com/meta-llama/llama-stack/pull/1065 -* feat: add support for running in a venv by @cdoern in https://github.com/meta-llama/llama-stack/pull/1018 -* feat: Adding sqlite-vec as a vectordb by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/1040 -* feat: support listing all for `llama stack list-providers` by @booxter in https://github.com/meta-llama/llama-stack/pull/1056 -* docs: Mention convential commits format in CONTRIBUTING.md by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1075 -* fix: logprobs support in remote-vllm provider by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1074 -* fix: improve signal handling and update dependencies by @leseb in https://github.com/meta-llama/llama-stack/pull/1044 -* style: update model id in model list title by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1072 -* fix: make backslash work in GET /models/{model_id:path} by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1068 -* chore: Link to Groq docs in the warning message for preview model by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1060 -* fix: remove :path in agents by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1077 -* build: format codebase imports using ruff linter by @leseb in https://github.com/meta-llama/llama-stack/pull/1028 -* chore: Consistent naming for VectorIO providers by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1023 -* test: Enable logprobs top_k tests for remote::vllm by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1080 -* docs: Fix url 
to the llama-stack-spec yaml/html files by @vishnoianil in https://github.com/meta-llama/llama-stack/pull/1081
-* fix: Update VectorIO config classes in registry by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1079
-* test: Add qdrant to provider tests by @jwm4 in https://github.com/meta-llama/llama-stack/pull/1039
-* test: add test for Agent.create_turn non-streaming response by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1078
-* fix!: update eval-tasks -> benchmarks by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1032
-* fix: openapi for eval-task by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1085
-* fix: regex pattern matching to support :path suffix in the routes by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/1089
-* fix: disable sqlite-vec test by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/1090
-* fix: add the missed help description info by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1096
-* fix: Update QdrantConfig to QdrantVectorIOConfig by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1104
-* docs: Add region parameter to Bedrock provider by @raghotham in https://github.com/meta-llama/llama-stack/pull/1103
-* build: configure ruff from pyproject.toml by @leseb in https://github.com/meta-llama/llama-stack/pull/1100
-* chore: move all Llama Stack types from llama-models to llama-stack by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1098
-* fix: enable_session_persistence in AgentConfig should be optional by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1012
-* fix: improve stack build on venv by @leseb in https://github.com/meta-llama/llama-stack/pull/980
-* fix: remove the empty line by @reidliu41 in https://github.com/meta-llama/llama-stack/pull/1097
-
-## New Contributors
-* @MichaelClifford made their first contribution in https://github.com/meta-llama/llama-stack/pull/1009
-* @ellistarn made their first contribution in https://github.com/meta-llama/llama-stack/pull/1035
-* @kelbrown20 made their first contribution in https://github.com/meta-llama/llama-stack/pull/992
-* @franciscojavierarceo made their first contribution in https://github.com/meta-llama/llama-stack/pull/1040
-* @bbrowning made their first contribution in https://github.com/meta-llama/llama-stack/pull/1075
-* @reidliu41 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1072
-* @vishnoianil made their first contribution in https://github.com/meta-llama/llama-stack/pull/1081
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.2...v0.1.3
+## v0.1.3 Release
+
+Here are some key changes in this release.
+
+### Build and Test Agents
+Streamlined the initial development experience
+- Added support for `llama stack run --image-type venv`
+- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
+- vLLM improvements for tool calling and logprobs
+- Better handling of sporadic code_interpreter tool calls
+
+### Agent Evals
+Better benchmarking and Agent performance assessment
+- Renamed eval API /eval-task to /benchmarks
+- Improved documentation and notebooks for RAG and evals
+
+### Deploy and Monitoring of Agents
+Improved production readiness
+- Added usage metrics collection for chat completions
+- CLI improvements for provider information
+- Improved error handling and system reliability
+- Better model endpoint handling and accessibility
+- Improved signal handling on distro server
+
+### Better Engineering
+Infrastructure and code quality improvements
+- Faster text-based chat completion tests
+- Improved testing for non-streaming agent APIs
+- Standardized import formatting with ruff linter
+- Added conventional commits standard
+- Fixed documentation parsing issues
+
---

# v0.1.2
Published on: 2025-02-07T22:06:49Z

-# TL;DR
-- Several stabilizations to development flows after the switch to `uv`
-- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
-- Added automated rebuilds for ReadTheDocs
-- Llama Stack server supports HTTPS
-- Added system prompt overrides support
-- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
-
-## What's Changed
-* Fix UBI9 image build when installing Python packages via uv by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/926
-* Fix precommit check after moving to ruff by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/927
-* LocalInferenceImpl update for LS 0.1 by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/911
-* Properly close PGVector DB connection during shutdown() by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/931
-* Add issue template config with docs and Discord links by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/930
-* Fix uv pip install timeout issue for PyTorch by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/929
-* github: ignore non-hidden python virtual environments by @nathan-weinberg in https://github.com/meta-llama/llama-stack/pull/939
-* fix: broken link in Quick Start doc by @nathan-weinberg in https://github.com/meta-llama/llama-stack/pull/943
-* fix: broken "core concepts" link in docs website by @nathan-weinberg in https://github.com/meta-llama/llama-stack/pull/940
-* Misc fixes by @ashwinb in https://github.com/meta-llama/llama-stack/pull/944
-* fix: formatting for ollama note in Quick Start doc by @nathan-weinberg in https://github.com/meta-llama/llama-stack/pull/945
-* [docs] typescript sdk readme by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/946
-* Support sys_prompt behavior in inference by @ehhuang in https://github.com/meta-llama/llama-stack/pull/937
-* if client.initialize fails, the example should exit by @cdoern in https://github.com/meta-llama/llama-stack/pull/954
-* Add Podman instructions to Quick Start by @jwm4 in https://github.com/meta-llama/llama-stack/pull/957
-* github: issue templates automatically apply relevant label by @nathan-weinberg in https://github.com/meta-llama/llama-stack/pull/956
-* docs: miscellaneous small fixes by @booxter
in https://github.com/meta-llama/llama-stack/pull/961 -* Make a couple properties optional by @ashwinb in https://github.com/meta-llama/llama-stack/pull/963 -* [docs] Make RAG example self-contained by @booxter in https://github.com/meta-llama/llama-stack/pull/962 -* docs, tests: replace datasets.rst with memory_optimizations.rst by @booxter in https://github.com/meta-llama/llama-stack/pull/968 -* Fix broken pgvector provider and memory leaks by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/947 -* [docs] update the zero_to_hero_guide llama stack version to 0.1.0 by @kami619 in https://github.com/meta-llama/llama-stack/pull/960 -* missing T in import by @cooktheryan in https://github.com/meta-llama/llama-stack/pull/974 -* Fix README.md notebook links by @aakankshaduggal in https://github.com/meta-llama/llama-stack/pull/976 -* docs: clarify host.docker.internal works for recent podman by @booxter in https://github.com/meta-llama/llama-stack/pull/977 -* docs: add addn server guidance for Linux users in Quick Start by @nathan-weinberg in https://github.com/meta-llama/llama-stack/pull/972 -* sys_prompt support in Agent by @ehhuang in https://github.com/meta-llama/llama-stack/pull/938 -* chore: update PR template to reinforce changelog by @leseb in https://github.com/meta-llama/llama-stack/pull/988 -* github: update PR template to use correct syntax to auto-close issues by @booxter in https://github.com/meta-llama/llama-stack/pull/989 -* chore: remove unused argument by @cdoern in https://github.com/meta-llama/llama-stack/pull/987 -* test: replace memory with vector_io fixture by @leseb in https://github.com/meta-llama/llama-stack/pull/984 -* docs: use uv in CONTRIBUTING guide by @leseb in https://github.com/meta-llama/llama-stack/pull/970 -* docs: Add license badge to README.md by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/994 -* Add Kubernetes deployment guide by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/899 -* Fix incorrect handling of chat completion endpoint in remote::vLLM by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/951 -* ci: Add semantic PR title check by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/979 -* feat: Add a new template for `dell` by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/978 -* docs: Correct typos in Zero to Hero guide by @mlecanu in https://github.com/meta-llama/llama-stack/pull/997 -* fix: Update rag examples to use fresh faiss index every time by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/998 -* doc: getting started notebook by @ehhuang in https://github.com/meta-llama/llama-stack/pull/996 -* test: fix flaky agent test by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1002 -* test: rm unused exception alias in pytest.raises by @leseb in https://github.com/meta-llama/llama-stack/pull/991 -* fix: List providers command prints out non-existing APIs from registry. 
Fixes #966 by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/969
-* chore: add missing ToolConfig import in groq.py by @leseb in https://github.com/meta-llama/llama-stack/pull/983
-* test: remove flaky agent test by @ehhuang in https://github.com/meta-llama/llama-stack/pull/1006
-* test: Split inference tests to text and vision by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/1008
-* feat: Add HTTPS serving option by @ashwinb in https://github.com/meta-llama/llama-stack/pull/1000
-* test: encode image data as base64 by @leseb in https://github.com/meta-llama/llama-stack/pull/1003
-* fix: Ensure a better error stack trace when llama-stack is not built by @cdoern in https://github.com/meta-llama/llama-stack/pull/950
-* refactor(ollama): model availability check by @leseb in https://github.com/meta-llama/llama-stack/pull/986
-
-## New Contributors
-* @nathan-weinberg made their first contribution in https://github.com/meta-llama/llama-stack/pull/939
-* @cdoern made their first contribution in https://github.com/meta-llama/llama-stack/pull/954
-* @jwm4 made their first contribution in https://github.com/meta-llama/llama-stack/pull/957
-* @booxter made their first contribution in https://github.com/meta-llama/llama-stack/pull/961
-* @kami619 made their first contribution in https://github.com/meta-llama/llama-stack/pull/960
-* @cooktheryan made their first contribution in https://github.com/meta-llama/llama-stack/pull/974
-* @aakankshaduggal made their first contribution in https://github.com/meta-llama/llama-stack/pull/976
-* @leseb made their first contribution in https://github.com/meta-llama/llama-stack/pull/988
-* @mlecanu made their first contribution in https://github.com/meta-llama/llama-stack/pull/997
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.1...v0.1.2
+# TL;DR
+- Several stabilizations to development flows after the switch to `uv`
+- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
+- Added automated rebuilds for ReadTheDocs
+- Llama Stack server supports HTTPS
+- Added system prompt overrides support
+- Several bug fixes and improvements to documentation (check out the Kubernetes deployment guide by @terrytangyuan)
+
---

# v0.1.1
Published on: 2025-02-02T02:29:24Z

-A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
-
-## What's Changed
-* Update doc templates for running safety on self-hosted templates by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/874
-* Update GH action so it correctly queries for test.pypi, etc.
by @ashwinb in https://github.com/meta-llama/llama-stack/pull/875 -* Fix report generation for url endpoints by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/876 -* Fixed typo by @BakungaBronson in https://github.com/meta-llama/llama-stack/pull/877 -* Fixed multiple typos by @BakungaBronson in https://github.com/meta-llama/llama-stack/pull/878 -* Ensure llama stack build --config <> --image-type <> works by @ashwinb in https://github.com/meta-llama/llama-stack/pull/879 -* Update documentation by @ashwinb in https://github.com/meta-llama/llama-stack/pull/865 -* Update discriminator to have the correct `mapping` by @ashwinb in https://github.com/meta-llama/llama-stack/pull/881 -* Fix telemetry init by @dineshyv in https://github.com/meta-llama/llama-stack/pull/885 -* Sambanova - LlamaGuard by @snova-edwardm in https://github.com/meta-llama/llama-stack/pull/886 -* Update index.md by @Ckhanoyan in https://github.com/meta-llama/llama-stack/pull/888 -* Report generation minor fixes by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/884 -* adding readme to docs folder for easier discoverability of notebooks … by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/857 -* Agent response format by @hanzlfs in https://github.com/meta-llama/llama-stack/pull/660 -* Add windows support for build execution by @VladOS95-cyber in https://github.com/meta-llama/llama-stack/pull/889 -* Add run win command for stack by @VladOS95-cyber in https://github.com/meta-llama/llama-stack/pull/890 -* Use ruamel.yaml to format the OpenAPI spec by @ashwinb in https://github.com/meta-llama/llama-stack/pull/892 -* Fix Chroma adapter by @ashwinb in https://github.com/meta-llama/llama-stack/pull/893 -* align with CompletionResponseStreamChunk.delta as str (instead of TextDelta) by @mattf in https://github.com/meta-llama/llama-stack/pull/900 -* add NVIDIA_BASE_URL and NVIDIA_API_KEY to control hosted vs local endpoints by @mattf in https://github.com/meta-llama/llama-stack/pull/897 -* Fix validator of "container" image type by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/901 -* Update OpenAPI generator to add param and field documentation by @ashwinb in https://github.com/meta-llama/llama-stack/pull/896 -* Fix link to selection guide and change "docker" to "container" by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/898 -* [#432] Groq Provider tool call tweaks by @aidando73 in https://github.com/meta-llama/llama-stack/pull/811 -* Fix running stack built with base conda environment by @dvrogozh in https://github.com/meta-llama/llama-stack/pull/903 -* create a github action for triggering client-sdk tests on new pull-request by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/850 -* log probs - mark pytests as xfail for unsupported providers + add support for together by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/883 -* SambaNova supports Llama 3.3 by @snova-edwardm in https://github.com/meta-llama/llama-stack/pull/905 -* fix ImageContentItem to take base64 string as image.data by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/909 -* Fix Agents to support code and rag simultaneously by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/908 -* add test for user message w/ image.data content by @mattf in https://github.com/meta-llama/llama-stack/pull/906 -* openapi gen return type fix for streaming/non-streaming by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/910 -* feat: 
enable xpu support for meta-reference stack by @dvrogozh in https://github.com/meta-llama/llama-stack/pull/558
-* Sec fixes as raised by bandit by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/917
-* Run code-gen by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/916
-* fix rag tests by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/918
-* Use `uv pip install` instead of `pip install` by @ashwinb in https://github.com/meta-llama/llama-stack/pull/921
-* add image support to NVIDIA inference provider by @mattf in https://github.com/meta-llama/llama-stack/pull/907
-
-## New Contributors
-* @BakungaBronson made their first contribution in https://github.com/meta-llama/llama-stack/pull/877
-* @Ckhanoyan made their first contribution in https://github.com/meta-llama/llama-stack/pull/888
-* @hanzlfs made their first contribution in https://github.com/meta-llama/llama-stack/pull/660
-* @dvrogozh made their first contribution in https://github.com/meta-llama/llama-stack/pull/903
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.0...v0.1.1
+A bunch of small and big improvements everywhere, including support for Windows, the switch to `uv`, and many provider improvements.
+
---

# v0.1.0
Published on: 2025-01-24T17:47:47Z

-We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor and those agents with telemetry, and evaluate the agent with scoring functions.
-
-## Context
-GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
-
-Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
-
-With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
- -## Release -After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements. - -There are example standalone apps in llama-stack-apps. - - -## Key Features of this release - -- **Unified API Layer** - - Inference: Run LLM models - - RAG: Store and retrieve knowledge for RAG - - Agents: Build multi-step agentic workflows - - Tools: Register tools that can be called by the agent - - Safety: Apply content filtering and safety policies - - Evaluation: Test model and agent quality - - Telemetry: Collect and analyze usage data and complex agentic traces - - Post Training ( Coming Soon ): Fine tune models for specific use cases - -- **Rich Provider Ecosystem** - - Local Development: Meta's Reference, Ollama - - Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras - - On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI - - On-device: iOS and Android support - -- **Built for Production** - - Pre-packaged distributions for common deployment scenarios - - Backwards compatibility across model versions - - Comprehensive evaluation capabilities - - Full observability and monitoring - -- **Multiple developer interfaces** - - CLI: Command line interface - - Python SDK - - Swift iOS SDK - - Kotlin Android SDK - -- **Sample llama stack applications** - - Python - - iOS - - Android - - -### What's Changed -* [4/n][torchtune integration] support lazy load model during inference by @SLR722 in https://github.com/meta-llama/llama-stack/pull/620 -* remove unused telemetry related code for console by @dineshyv in https://github.com/meta-llama/llama-stack/pull/659 -* Fix Meta reference GPU implementation by @ashwinb in https://github.com/meta-llama/llama-stack/pull/663 -* Fixed imports for inference by @cdgamarose-nv in https://github.com/meta-llama/llama-stack/pull/661 -* fix trace starting in library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/655 -* Add Llama 70B 3.3 to fireworks by @aidando73 in https://github.com/meta-llama/llama-stack/pull/654 -* Tools API with brave and MCP providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/639 -* [torchtune integration] post training + eval by @SLR722 in https://github.com/meta-llama/llama-stack/pull/670 -* Fix post training apis broken by torchtune release by @SLR722 in https://github.com/meta-llama/llama-stack/pull/674 -* Add missing venv option in --image-type by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/677 -* Removed unnecessary CONDA_PREFIX env var in installation guide by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/683 -* Add 3.3 70B to Ollama inference provider by @aidando73 in https://github.com/meta-llama/llama-stack/pull/681 -* docs: update evals_reference/index.md by @eltociear in https://github.com/meta-llama/llama-stack/pull/675 -* [remove import *][1/n] clean up import & in apis/* by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/689 -* [bugfix] fix broken vision inference, change serialization for bytes by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/693 -* Minor Quick Start documentation updates. 
by @derekslager in https://github.com/meta-llama/llama-stack/pull/692 -* [bugfix] fix meta-reference agents w/ safety multiple model loading pytest by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/694 -* [bugfix] fix prompt_adapter interleaved_content_convert_to_raw by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/696 -* Add missing "inline::" prefix for providers in building_distro.md by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/702 -* Fix failing flake8 E226 check by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/701 -* Add missing newlines before printing the Dockerfile content by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/700 -* Add JSON structured outputs to Ollama Provider by @aidando73 in https://github.com/meta-llama/llama-stack/pull/680 -* [#407] Agents: Avoid calling tools that haven't been explicitly enabled by @aidando73 in https://github.com/meta-llama/llama-stack/pull/637 -* Made changes to readme and pinning to llamastack v0.0.61 by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/624 -* [rag evals][1/n] refactor base scoring fn & data schema check by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/664 -* [Post Training] Fix missing import by @SLR722 in https://github.com/meta-llama/llama-stack/pull/705 -* Import from the right path by @SLR722 in https://github.com/meta-llama/llama-stack/pull/708 -* [#432] Add Groq Provider - chat completions by @aidando73 in https://github.com/meta-llama/llama-stack/pull/609 -* Change post training run.yaml inference config by @SLR722 in https://github.com/meta-llama/llama-stack/pull/710 -* [Post training] make validation steps configurable by @SLR722 in https://github.com/meta-llama/llama-stack/pull/715 -* Fix incorrect entrypoint for broken `llama stack run` by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/706 -* Fix assert message and call to completion_request_to_prompt in remote:vllm by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/709 -* Fix Groq invalid self.config reference by @aidando73 in https://github.com/meta-llama/llama-stack/pull/719 -* support llama3.1 8B instruct in post training by @SLR722 in https://github.com/meta-llama/llama-stack/pull/698 -* remove default logger handlers when using libcli with notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/718 -* move DataSchemaValidatorMixin into standalone utils by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/720 -* add 3.3 to together inference provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/729 -* Update CODEOWNERS - add sixianyi0721 as the owner by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/731 -* fix links for distro by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/733 -* add --version to llama stack CLI & /version endpoint by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/732 -* agents to use tools api by @dineshyv in https://github.com/meta-llama/llama-stack/pull/673 -* Add X-LlamaStack-Client-Version, rename ProviderData -> Provider-Data by @ashwinb in https://github.com/meta-llama/llama-stack/pull/735 -* Check version incompatibility by @ashwinb in https://github.com/meta-llama/llama-stack/pull/738 -* Add persistence for localfs datasets by @VladOS95-cyber in https://github.com/meta-llama/llama-stack/pull/557 -* Fixed typo in default VLLM_URL in remote-vllm.md by @terrytangyuan in 
https://github.com/meta-llama/llama-stack/pull/723 -* Consolidating Memory tests under client-sdk by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/703 -* Expose LLAMASTACK_PORT in cli.stack.run by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/722 -* remove conflicting default for tool prompt format in chat completion by @dineshyv in https://github.com/meta-llama/llama-stack/pull/742 -* rename LLAMASTACK_PORT to LLAMA_STACK_PORT for consistency with other env vars by @raghotham in https://github.com/meta-llama/llama-stack/pull/744 -* Add inline vLLM inference provider to regression tests and fix regressions by @frreiss in https://github.com/meta-llama/llama-stack/pull/662 -* [CICD] github workflow to push nightly package to testpypi by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/734 -* Replaced zrangebylex method in the range method by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/521 -* Improve model download doc by @SLR722 in https://github.com/meta-llama/llama-stack/pull/748 -* Support building UBI9 base container image by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/676 -* update notebook to use new tool defs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/745 -* Add provider data passing for library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/750 -* [Fireworks] Update model name for Fireworks by @benjibc in https://github.com/meta-llama/llama-stack/pull/753 -* Consolidating Inference tests under client-sdk tests by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/751 -* Consolidating Safety tests from various places under client-sdk by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/699 -* [CI/CD] more robust re-try for downloading testpypi package by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/749 -* [#432] Add Groq Provider - tool calls by @aidando73 in https://github.com/meta-llama/llama-stack/pull/630 -* Rename ipython to tool by @ashwinb in https://github.com/meta-llama/llama-stack/pull/756 -* Fix incorrect Python binary path for UBI9 image by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/757 -* Update Cerebras docs to include header by @henrytwo in https://github.com/meta-llama/llama-stack/pull/704 -* Add init files to post training folders by @SLR722 in https://github.com/meta-llama/llama-stack/pull/711 -* Switch to use importlib instead of deprecated pkg_resources by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/678 -* [bugfix] fix streaming GeneratorExit exception with LlamaStackAsLibraryClient by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/760 -* Fix telemetry to work on reinstantiating new lib cli by @dineshyv in https://github.com/meta-llama/llama-stack/pull/761 -* [post training] define llama stack post training dataset format by @SLR722 in https://github.com/meta-llama/llama-stack/pull/717 -* add braintrust to experimental-post-training template by @SLR722 in https://github.com/meta-llama/llama-stack/pull/763 -* added support of PYPI_VERSION in stack build by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/762 -* Fix broken tests in test_registry by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/707 -* Fix fireworks run-with-safety template by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/766 -* Free up memory after post training finishes by @SLR722 in 
https://github.com/meta-llama/llama-stack/pull/770 -* Fix issue when generating distros by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/755 -* Convert `SamplingParams.strategy` to a union by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/767 -* [CICD] Github workflow for publishing Docker images by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/764 -* [bugfix] fix llama guard parsing ContentDelta by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/772 -* rebase eval test w/ tool_runtime fixtures by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/773 -* More idiomatic REST API by @dineshyv in https://github.com/meta-llama/llama-stack/pull/765 -* add nvidia distribution by @cdgamarose-nv in https://github.com/meta-llama/llama-stack/pull/565 -* bug fixes on inference tests by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/774 -* [bugfix] fix inference sdk test for v1 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/775 -* fix routing in library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/776 -* [bugfix] fix client-sdk tests for v1 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/777 -* fix nvidia inference provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/781 -* Make notebook testable by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/780 -* Fix telemetry by @dineshyv in https://github.com/meta-llama/llama-stack/pull/787 -* fireworks add completion logprobs adapter by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/778 -* Idiomatic REST API: Inspect by @dineshyv in https://github.com/meta-llama/llama-stack/pull/779 -* Idiomatic REST API: Evals by @dineshyv in https://github.com/meta-llama/llama-stack/pull/782 -* Add notebook testing to nightly build job by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/785 -* [test automation] support run tests on config file by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/730 -* Idiomatic REST API: Telemetry by @dineshyv in https://github.com/meta-llama/llama-stack/pull/786 -* Make llama stack build not create a new conda by default by @ashwinb in https://github.com/meta-llama/llama-stack/pull/788 -* REST API fixes by @dineshyv in https://github.com/meta-llama/llama-stack/pull/789 -* fix cerebras template by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/790 -* [Test automation] generate custom test report by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/739 -* cerebras template update for memory by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/792 -* Pin torchtune pkg version by @SLR722 in https://github.com/meta-llama/llama-stack/pull/791 -* fix the code execution test in sdk tests by @dineshyv in https://github.com/meta-llama/llama-stack/pull/794 -* add default toolgroups to all providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/795 -* Fix tgi adapter by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/796 -* Remove llama-guard in Cerebras template & improve agent test by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/798 -* meta reference inference fixes by @ashwinb in https://github.com/meta-llama/llama-stack/pull/797 -* fix provider model list test by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/800 -* fix playground for v1 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/799 -* fix eval notebook 
& add test to workflow by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/803 -* add json_schema_type to ParamType deps by @dineshyv in https://github.com/meta-llama/llama-stack/pull/808 -* Fixing small typo in quick start guide by @pmccarthy in https://github.com/meta-llama/llama-stack/pull/807 -* cannot import name 'GreedySamplingStrategy' by @aidando73 in https://github.com/meta-llama/llama-stack/pull/806 -* optional api dependencies by @ashwinb in https://github.com/meta-llama/llama-stack/pull/793 -* fix vllm template by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/813 -* More generic image type for OCI-compliant container technologies by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/802 -* add mcp runtime as default to all providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/816 -* fix vllm base64 image inference by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/815 -* fix again vllm for non base64 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/818 -* Fix incorrect RunConfigSettings due to the removal of conda_env by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/801 -* Fix incorrect image type in publish-to-docker workflow by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/819 -* test report for v0.1 by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/814 -* [CICD] add simple test step for docker build workflow, fix prefix bug by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/821 -* add section for mcp tool usage in notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/831 -* [ez] structured output for /completion ollama & enable tests by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/822 -* add pytest option to generate a functional report for distribution by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/833 -* bug fix for distro report generation by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/836 -* [memory refactor][1/n] Rename Memory -> VectorIO, MemoryBanks -> VectorDBs by @ashwinb in https://github.com/meta-llama/llama-stack/pull/828 -* [memory refactor][2/n] Update faiss and make it pass tests by @ashwinb in https://github.com/meta-llama/llama-stack/pull/830 -* [memory refactor][3/n] Introduce RAGToolRuntime as a specialized sub-protocol by @ashwinb in https://github.com/meta-llama/llama-stack/pull/832 -* [memory refactor][4/n] Update the client-sdk test for RAG by @ashwinb in https://github.com/meta-llama/llama-stack/pull/834 -* [memory refactor][5/n] Migrate all vector_io providers by @ashwinb in https://github.com/meta-llama/llama-stack/pull/835 -* [memory refactor][6/n] Update naming and routes by @ashwinb in https://github.com/meta-llama/llama-stack/pull/839 -* Fix fireworks client sdk chat completion with images by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/840 -* [inference api] modify content types so they follow a more standard structure by @ashwinb in https://github.com/meta-llama/llama-stack/pull/841 -* fix experimental-post-training template by @SLR722 in https://github.com/meta-llama/llama-stack/pull/842 -* Improved report generation for providers by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/844 -* [client sdk test] add options for inference_model, safety_shield, embedding_model by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/843 -* add distro report by @sixianyi0721 in 
https://github.com/meta-llama/llama-stack/pull/847
-* Update Documentation by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/838
-* Update OpenAPI generator to output discriminator by @ashwinb in https://github.com/meta-llama/llama-stack/pull/848
-* update docs for tools and telemetry by @dineshyv in https://github.com/meta-llama/llama-stack/pull/846
-* Add vLLM raw completions API by @aidando73 in https://github.com/meta-llama/llama-stack/pull/823
-* update doc for client-sdk testing by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/849
-* Delete docs/to_situate directory by @raghotham in https://github.com/meta-llama/llama-stack/pull/851
-* Fixed distro documentation by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/852
-* remove getting started notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/853
-* More Updates to Read the Docs by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/856
-* Llama_Stack_Building_AI_Applications.ipynb -> getting_started.ipynb by @dineshyv in https://github.com/meta-llama/llama-stack/pull/854
-* update docs for adding new API providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/855
-* Add Runpod Provider + Distribution by @pandyamarut in https://github.com/meta-llama/llama-stack/pull/362
-* Sambanova inference provider by @snova-edwardm in https://github.com/meta-llama/llama-stack/pull/555
-* Updates to ReadTheDocs by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/859
-* sync readme.md to index.md by @dineshyv in https://github.com/meta-llama/llama-stack/pull/860
-* More updates to ReadTheDocs by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/861
-* make default tool prompt format none in agent config by @dineshyv in https://github.com/meta-llama/llama-stack/pull/863
-* update the client reference by @dineshyv in https://github.com/meta-llama/llama-stack/pull/864
-* update python sdk reference by @dineshyv in https://github.com/meta-llama/llama-stack/pull/866
-* remove logger handler only in notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/868
-* Update 'first RAG agent' in gettingstarted doc by @ehhuang in https://github.com/meta-llama/llama-stack/pull/867
-
-## New Contributors
-* @cdgamarose-nv made their first contribution in https://github.com/meta-llama/llama-stack/pull/661
-* @eltociear made their first contribution in https://github.com/meta-llama/llama-stack/pull/675
-* @derekslager made their first contribution in https://github.com/meta-llama/llama-stack/pull/692
-* @VladOS95-cyber made their first contribution in https://github.com/meta-llama/llama-stack/pull/557
-* @frreiss made their first contribution in https://github.com/meta-llama/llama-stack/pull/662
-* @pmccarthy made their first contribution in https://github.com/meta-llama/llama-stack/pull/807
-* @pandyamarut made their first contribution in https://github.com/meta-llama/llama-stack/pull/362
-* @snova-edwardm made their first contribution in https://github.com/meta-llama/llama-stack/pull/555
-* @ehhuang made their first contribution in https://github.com/meta-llama/llama-stack/pull/867
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.63...v0.1.0
+We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor those agents with telemetry, and evaluate them with scoring functions.
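+
+As a quick taste, here is a minimal chat-completion sketch against the unified inference API using the Python client SDK. The server URL, port, and model ID are illustrative, and exact field names may differ slightly across client versions:
+
+```python
+# Minimal sketch, assuming a Llama Stack server is already running locally
+# (e.g. via `llama stack run`); the port and model ID below are examples.
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+# One chat completion through the unified inference API.
+response = client.inference.chat_completion(
+    model_id="meta-llama/Llama-3.3-70B-Instruct",  # illustrative model ID
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is Llama Stack?"},
+    ],
+)
+print(response.completion_message.content)
+```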
+
+## Context
+GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
+
+Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs from both AI developers and partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
+
+With Llama Stack, you can easily build a RAG agent that can also search the web, do complex math, and call custom tools. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackaged distributions, you can run your agent anywhere: in the cloud with our partners, in your own environment using virtualenv, conda, or Docker, locally with Ollama, or even on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
+
+## Release
+After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages (v0.1.0). We now have automated tests that verify every provider implementation, so developers can easily and reliably select distributions or providers based on their specific requirements.
+
+There are example standalone apps in llama-stack-apps.
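+
+A similarly hedged sketch of the agent flow described above, using the client SDK's agent helpers. The class and parameter names follow the 0.1.x Python SDK as we understand it; treat the model ID, session name, and instructions as placeholders:
+
+```python
+# Minimal agent sketch, assuming the same locally running server as above.
+from llama_stack_client import LlamaStackClient
+from llama_stack_client.lib.agents.agent import Agent
+from llama_stack_client.lib.agents.event_logger import EventLogger
+from llama_stack_client.types.agent_create_params import AgentConfig
+
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+agent = Agent(client, AgentConfig(
+    model="meta-llama/Llama-3.3-70B-Instruct",  # illustrative model ID
+    instructions="You are a helpful assistant.",
+    enable_session_persistence=False,
+))
+session_id = agent.create_session("demo-session")
+
+# Each turn streams events (inference deltas, tool calls, shield calls)
+# that can be pretty-printed with the bundled event logger.
+response = agent.create_turn(
+    messages=[{"role": "user", "content": "What is Llama Stack?"}],
+    session_id=session_id,
+)
+for log in EventLogger().log(response):
+    log.print()
+```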
+
+
+## Key Features of this release
+
+- **Unified API Layer**
+ - Inference: Run LLM models
+ - RAG: Store and retrieve knowledge for RAG
+ - Agents: Build multi-step agentic workflows
+ - Tools: Register tools that can be called by the agent
+ - Safety: Apply content filtering and safety policies
+ - Evaluation: Test model and agent quality
+ - Telemetry: Collect and analyze usage data and complex agentic traces
+ - Post Training (Coming Soon): Fine-tune models for specific use cases
+
+- **Rich Provider Ecosystem**
+ - Local Development: Meta's Reference, Ollama
+ - Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
+ - On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
+ - On-device: iOS and Android support
+
+- **Built for Production**
+ - Pre-packaged distributions for common deployment scenarios
+ - Backwards compatibility across model versions
+ - Comprehensive evaluation capabilities
+ - Full observability and monitoring
+
+- **Multiple developer interfaces**
+ - CLI: Command line interface
+ - Python SDK
+ - Swift iOS SDK
+ - Kotlin Android SDK
+
+- **Sample Llama Stack applications**
+ - Python
+ - iOS
+ - Android
+
+
---

# v0.1.0rc12
Published on: 2025-01-22T22:24:01Z

-## What's Changed
-* [4/n][torchtune integration] support lazy load model during inference by @SLR722 in https://github.com/meta-llama/llama-stack/pull/620
-* remove unused telemetry related code for console by @dineshyv in https://github.com/meta-llama/llama-stack/pull/659
-* Fix Meta reference GPU implementation by @ashwinb in https://github.com/meta-llama/llama-stack/pull/663
-* Fixed imports for inference by @cdgamarose-nv in https://github.com/meta-llama/llama-stack/pull/661
-* fix trace starting in library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/655
-* Add Llama 70B 3.3 to fireworks by @aidando73 in https://github.com/meta-llama/llama-stack/pull/654
-* Tools API with brave and MCP providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/639
-* [torchtune integration] post training + eval by @SLR722 in https://github.com/meta-llama/llama-stack/pull/670
-* Fix post training apis broken by torchtune release by @SLR722 in https://github.com/meta-llama/llama-stack/pull/674
-* Add missing venv option in --image-type by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/677
-* Removed unnecessary CONDA_PREFIX env var in installation guide by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/683
-* Add 3.3 70B to Ollama inference provider by @aidando73 in https://github.com/meta-llama/llama-stack/pull/681
-* docs: update evals_reference/index.md by @eltociear in https://github.com/meta-llama/llama-stack/pull/675
-* [remove import *][1/n] clean up import & in apis/* by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/689
-* [bugfix] fix broken vision inference, change serialization for bytes by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/693
-* Minor Quick Start documentation updates.
by @derekslager in https://github.com/meta-llama/llama-stack/pull/692 -* [bugfix] fix meta-reference agents w/ safety multiple model loading pytest by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/694 -* [bugfix] fix prompt_adapter interleaved_content_convert_to_raw by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/696 -* Add missing "inline::" prefix for providers in building_distro.md by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/702 -* Fix failing flake8 E226 check by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/701 -* Add missing newlines before printing the Dockerfile content by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/700 -* Add JSON structured outputs to Ollama Provider by @aidando73 in https://github.com/meta-llama/llama-stack/pull/680 -* [#407] Agents: Avoid calling tools that haven't been explicitly enabled by @aidando73 in https://github.com/meta-llama/llama-stack/pull/637 -* Made changes to readme and pinning to llamastack v0.0.61 by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/624 -* [rag evals][1/n] refactor base scoring fn & data schema check by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/664 -* [Post Training] Fix missing import by @SLR722 in https://github.com/meta-llama/llama-stack/pull/705 -* Import from the right path by @SLR722 in https://github.com/meta-llama/llama-stack/pull/708 -* [#432] Add Groq Provider - chat completions by @aidando73 in https://github.com/meta-llama/llama-stack/pull/609 -* Change post training run.yaml inference config by @SLR722 in https://github.com/meta-llama/llama-stack/pull/710 -* [Post training] make validation steps configurable by @SLR722 in https://github.com/meta-llama/llama-stack/pull/715 -* Fix incorrect entrypoint for broken `llama stack run` by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/706 -* Fix assert message and call to completion_request_to_prompt in remote:vllm by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/709 -* Fix Groq invalid self.config reference by @aidando73 in https://github.com/meta-llama/llama-stack/pull/719 -* support llama3.1 8B instruct in post training by @SLR722 in https://github.com/meta-llama/llama-stack/pull/698 -* remove default logger handlers when using libcli with notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/718 -* move DataSchemaValidatorMixin into standalone utils by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/720 -* add 3.3 to together inference provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/729 -* Update CODEOWNERS - add sixianyi0721 as the owner by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/731 -* fix links for distro by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/733 -* add --version to llama stack CLI & /version endpoint by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/732 -* agents to use tools api by @dineshyv in https://github.com/meta-llama/llama-stack/pull/673 -* Add X-LlamaStack-Client-Version, rename ProviderData -> Provider-Data by @ashwinb in https://github.com/meta-llama/llama-stack/pull/735 -* Check version incompatibility by @ashwinb in https://github.com/meta-llama/llama-stack/pull/738 -* Add persistence for localfs datasets by @VladOS95-cyber in https://github.com/meta-llama/llama-stack/pull/557 -* Fixed typo in default VLLM_URL in remote-vllm.md by @terrytangyuan in 
https://github.com/meta-llama/llama-stack/pull/723 -* Consolidating Memory tests under client-sdk by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/703 -* Expose LLAMASTACK_PORT in cli.stack.run by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/722 -* remove conflicting default for tool prompt format in chat completion by @dineshyv in https://github.com/meta-llama/llama-stack/pull/742 -* rename LLAMASTACK_PORT to LLAMA_STACK_PORT for consistency with other env vars by @raghotham in https://github.com/meta-llama/llama-stack/pull/744 -* Add inline vLLM inference provider to regression tests and fix regressions by @frreiss in https://github.com/meta-llama/llama-stack/pull/662 -* [CICD] github workflow to push nightly package to testpypi by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/734 -* Replaced zrangebylex method in the range method by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/521 -* Improve model download doc by @SLR722 in https://github.com/meta-llama/llama-stack/pull/748 -* Support building UBI9 base container image by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/676 -* update notebook to use new tool defs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/745 -* Add provider data passing for library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/750 -* [Fireworks] Update model name for Fireworks by @benjibc in https://github.com/meta-llama/llama-stack/pull/753 -* Consolidating Inference tests under client-sdk tests by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/751 -* Consolidating Safety tests from various places under client-sdk by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/699 -* [CI/CD] more robust re-try for downloading testpypi package by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/749 -* [#432] Add Groq Provider - tool calls by @aidando73 in https://github.com/meta-llama/llama-stack/pull/630 -* Rename ipython to tool by @ashwinb in https://github.com/meta-llama/llama-stack/pull/756 -* Fix incorrect Python binary path for UBI9 image by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/757 -* Update Cerebras docs to include header by @henrytwo in https://github.com/meta-llama/llama-stack/pull/704 -* Add init files to post training folders by @SLR722 in https://github.com/meta-llama/llama-stack/pull/711 -* Switch to use importlib instead of deprecated pkg_resources by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/678 -* [bugfix] fix streaming GeneratorExit exception with LlamaStackAsLibraryClient by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/760 -* Fix telemetry to work on reinstantiating new lib cli by @dineshyv in https://github.com/meta-llama/llama-stack/pull/761 -* [post training] define llama stack post training dataset format by @SLR722 in https://github.com/meta-llama/llama-stack/pull/717 -* add braintrust to experimental-post-training template by @SLR722 in https://github.com/meta-llama/llama-stack/pull/763 -* added support of PYPI_VERSION in stack build by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/762 -* Fix broken tests in test_registry by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/707 -* Fix fireworks run-with-safety template by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/766 -* Free up memory after post training finishes by @SLR722 in 
https://github.com/meta-llama/llama-stack/pull/770 -* Fix issue when generating distros by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/755 -* Convert `SamplingParams.strategy` to a union by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/767 -* [CICD] Github workflow for publishing Docker images by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/764 -* [bugfix] fix llama guard parsing ContentDelta by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/772 -* rebase eval test w/ tool_runtime fixtures by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/773 -* More idiomatic REST API by @dineshyv in https://github.com/meta-llama/llama-stack/pull/765 -* add nvidia distribution by @cdgamarose-nv in https://github.com/meta-llama/llama-stack/pull/565 -* bug fixes on inference tests by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/774 -* [bugfix] fix inference sdk test for v1 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/775 -* fix routing in library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/776 -* [bugfix] fix client-sdk tests for v1 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/777 -* fix nvidia inference provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/781 -* Make notebook testable by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/780 -* Fix telemetry by @dineshyv in https://github.com/meta-llama/llama-stack/pull/787 -* fireworks add completion logprobs adapter by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/778 -* Idiomatic REST API: Inspect by @dineshyv in https://github.com/meta-llama/llama-stack/pull/779 -* Idiomatic REST API: Evals by @dineshyv in https://github.com/meta-llama/llama-stack/pull/782 -* Add notebook testing to nightly build job by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/785 -* [test automation] support run tests on config file by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/730 -* Idiomatic REST API: Telemetry by @dineshyv in https://github.com/meta-llama/llama-stack/pull/786 -* Make llama stack build not create a new conda by default by @ashwinb in https://github.com/meta-llama/llama-stack/pull/788 -* REST API fixes by @dineshyv in https://github.com/meta-llama/llama-stack/pull/789 -* fix cerebras template by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/790 -* [Test automation] generate custom test report by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/739 -* cerebras template update for memory by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/792 -* Pin torchtune pkg version by @SLR722 in https://github.com/meta-llama/llama-stack/pull/791 -* fix the code execution test in sdk tests by @dineshyv in https://github.com/meta-llama/llama-stack/pull/794 -* add default toolgroups to all providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/795 -* Fix tgi adapter by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/796 -* Remove llama-guard in Cerebras template & improve agent test by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/798 -* meta reference inference fixes by @ashwinb in https://github.com/meta-llama/llama-stack/pull/797 -* fix provider model list test by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/800 -* fix playground for v1 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/799 -* fix eval notebook 
& add test to workflow by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/803 -* add json_schema_type to ParamType deps by @dineshyv in https://github.com/meta-llama/llama-stack/pull/808 -* Fixing small typo in quick start guide by @pmccarthy in https://github.com/meta-llama/llama-stack/pull/807 -* cannot import name 'GreedySamplingStrategy' by @aidando73 in https://github.com/meta-llama/llama-stack/pull/806 -* optional api dependencies by @ashwinb in https://github.com/meta-llama/llama-stack/pull/793 -* fix vllm template by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/813 -* More generic image type for OCI-compliant container technologies by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/802 -* add mcp runtime as default to all providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/816 -* fix vllm base64 image inference by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/815 -* fix again vllm for non base64 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/818 -* Fix incorrect RunConfigSettings due to the removal of conda_env by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/801 -* Fix incorrect image type in publish-to-docker workflow by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/819 -* test report for v0.1 by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/814 -* [CICD] add simple test step for docker build workflow, fix prefix bug by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/821 -* add section for mcp tool usage in notebook by @dineshyv in https://github.com/meta-llama/llama-stack/pull/831 -* [ez] structured output for /completion ollama & enable tests by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/822 -* add pytest option to generate a functional report for distribution by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/833 -* bug fix for distro report generation by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/836 -* [memory refactor][1/n] Rename Memory -> VectorIO, MemoryBanks -> VectorDBs by @ashwinb in https://github.com/meta-llama/llama-stack/pull/828 -* [memory refactor][2/n] Update faiss and make it pass tests by @ashwinb in https://github.com/meta-llama/llama-stack/pull/830 -* [memory refactor][3/n] Introduce RAGToolRuntime as a specialized sub-protocol by @ashwinb in https://github.com/meta-llama/llama-stack/pull/832 -* [memory refactor][4/n] Update the client-sdk test for RAG by @ashwinb in https://github.com/meta-llama/llama-stack/pull/834 -* [memory refactor][5/n] Migrate all vector_io providers by @ashwinb in https://github.com/meta-llama/llama-stack/pull/835 -* [memory refactor][6/n] Update naming and routes by @ashwinb in https://github.com/meta-llama/llama-stack/pull/839 -* Fix fireworks client sdk chat completion with images by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/840 -* [inference api] modify content types so they follow a more standard structure by @ashwinb in https://github.com/meta-llama/llama-stack/pull/841 -## New Contributors -* @cdgamarose-nv made their first contribution in https://github.com/meta-llama/llama-stack/pull/661 -* @eltociear made their first contribution in https://github.com/meta-llama/llama-stack/pull/675 -* @derekslager made their first contribution in https://github.com/meta-llama/llama-stack/pull/692 -* @VladOS95-cyber made their first contribution in 
https://github.com/meta-llama/llama-stack/pull/557
-* @frreiss made their first contribution in https://github.com/meta-llama/llama-stack/pull/662
-* @pmccarthy made their first contribution in https://github.com/meta-llama/llama-stack/pull/807
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.63...v0.1.0rc11

---

# v0.0.63
Published on: 2024-12-18T07:17:43Z

-A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue especially affected multimodal agentic turns.
-
+A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue especially affected multimodal agentic turns.
+
 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63

---

@@ -853,439 +205,67 @@ A small but important bug-fix release to update the URL datatype for the client-

 # v0.0.62
 Published on: 2024-12-18T02:39:43Z

-## What's Changed
-A few important updates, some of which are backwards incompatible. You must update your `run.yaml`s when upgrading. As always, look to `templates/<distro>/run.yaml` for reference.
-
-* Make embedding generation go through inference by @dineshyv in https://github.com/meta-llama/llama-stack/pull/606
-* [/scoring] add ability to define aggregation functions for scoring functions & refactors by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/597
-* Update the "InterleavedTextMedia" type by @ashwinb in https://github.com/meta-llama/llama-stack/pull/635
-* [NEW!] Experimental post-training APIs! https://github.com/meta-llama/llama-stack/pull/540, https://github.com/meta-llama/llama-stack/pull/593, etc.
-
-A variety of fixes and enhancements. Some selected ones:
-
-* [#342] RAG - fix PDF format in vector database by @aidando73 in https://github.com/meta-llama/llama-stack/pull/551
-* add completion api support to nvidia inference provider by @mattf in https://github.com/meta-llama/llama-stack/pull/533
-* add model type to APIs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/588
-* Allow using an "inline" version of Chroma using PersistentClient by @ashwinb in https://github.com/meta-llama/llama-stack/pull/567
-* [docs] add playground ui docs by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/592
-* add colab notebook & update docs by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/619
-* [tests] add client-sdk pytests & delete client.py by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/638
-* [bugfix] no shield_call when there's no shields configured by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/642
-
-## New Contributors
-* @SLR722 made their first contribution in https://github.com/meta-llama/llama-stack/pull/540
-* @iamarunbrahma made their first contribution in https://github.com/meta-llama/llama-stack/pull/636
-
-**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.61...v0.0.62

---

# v0.0.61
Published on: 2024-12-10T20:50:33Z

-## What's Changed
-* add NVIDIA NIM inference adapter by @mattf in https://github.com/meta-llama/llama-stack/pull/355
-* Tgi fixture by @dineshyv in https://github.com/meta-llama/llama-stack/pull/519
-* fixes tests & move braintrust api_keys to request headers by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/535
-* allow env NVIDIA_BASE_URL to set NVIDIAConfig.url by @mattf in https://github.com/meta-llama/llama-stack/pull/531
-* move playground ui to llama-stack repo by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/536
-* 
fix[documentation]: Update links to point to correct pages by @sablair in https://github.com/meta-llama/llama-stack/pull/549 -* Fix URLs to Llama Stack Read the Docs Webpages by @JeffreyLind3 in https://github.com/meta-llama/llama-stack/pull/547 -* Fix Zero to Hero README.md Formatting by @JeffreyLind3 in https://github.com/meta-llama/llama-stack/pull/546 -* Guide readme fix by @raghotham in https://github.com/meta-llama/llama-stack/pull/552 -* Fix broken Ollama link by @aidando73 in https://github.com/meta-llama/llama-stack/pull/554 -* update client cli docs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/560 -* reduce the accuracy requirements to pass the chat completion structured output test by @mattf in https://github.com/meta-llama/llama-stack/pull/522 -* removed assertion in ollama.py and fixed typo in the readme by @wukaixingxp in https://github.com/meta-llama/llama-stack/pull/563 -* Cerebras Inference Integration by @henrytwo in https://github.com/meta-llama/llama-stack/pull/265 -* unregister API for dataset by @sixianyi0721 in https://github.com/meta-llama/llama-stack/pull/507 -* [llama stack ui] add native eval & inspect distro & playground pages by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/541 -* Telemetry API redesign by @dineshyv in https://github.com/meta-llama/llama-stack/pull/525 -* Introduce GitHub Actions Workflow for Llama Stack Tests by @ConnorHack in https://github.com/meta-llama/llama-stack/pull/523 -* specify the client version that works for current together server by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/566 -* remove unused telemetry related code by @dineshyv in https://github.com/meta-llama/llama-stack/pull/570 -* Fix up safety client for versioned API by @stevegrubb in https://github.com/meta-llama/llama-stack/pull/573 -* Add eval/scoring/datasetio API providers to distribution templates & UI developer guide by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/564 -* Add ability to query and export spans to dataset by @dineshyv in https://github.com/meta-llama/llama-stack/pull/574 -* Renames otel config from jaeger to otel by @codefromthecrypt in https://github.com/meta-llama/llama-stack/pull/569 -* add telemetry docs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/572 -* Console span processor improvements by @dineshyv in https://github.com/meta-llama/llama-stack/pull/577 -* doc: quickstart guide errors by @aidando73 in https://github.com/meta-llama/llama-stack/pull/575 -* Add kotlin docs by @Riandy in https://github.com/meta-llama/llama-stack/pull/568 -* Update android_sdk.md by @Riandy in https://github.com/meta-llama/llama-stack/pull/578 -* Bump kotlin docs to 0.0.54.1 by @Riandy in https://github.com/meta-llama/llama-stack/pull/579 -* Make LlamaStackLibraryClient work correctly by @ashwinb in https://github.com/meta-llama/llama-stack/pull/581 -* Update integration type for Cerebras to hosted by @henrytwo in https://github.com/meta-llama/llama-stack/pull/583 -* Use customtool's get_tool_definition to remove duplication by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/584 -* [#391] Add support for json structured output for vLLM by @aidando73 in https://github.com/meta-llama/llama-stack/pull/528 -* Fix Jaeger instructions by @yurishkuro in https://github.com/meta-llama/llama-stack/pull/580 -* fix telemetry import by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/585 -* update template run.yaml to include openai api key for braintrust by @yanxi0830 
in https://github.com/meta-llama/llama-stack/pull/590 -* add tracing to library client by @dineshyv in https://github.com/meta-llama/llama-stack/pull/591 -* Fixes for library client by @ashwinb in https://github.com/meta-llama/llama-stack/pull/587 -* Fix issue 586 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/594 -## New Contributors -* @sablair made their first contribution in https://github.com/meta-llama/llama-stack/pull/549 -* @JeffreyLind3 made their first contribution in https://github.com/meta-llama/llama-stack/pull/547 -* @aidando73 made their first contribution in https://github.com/meta-llama/llama-stack/pull/554 -* @henrytwo made their first contribution in https://github.com/meta-llama/llama-stack/pull/265 -* @sixianyi0721 made their first contribution in https://github.com/meta-llama/llama-stack/pull/507 -* @ConnorHack made their first contribution in https://github.com/meta-llama/llama-stack/pull/523 -* @yurishkuro made their first contribution in https://github.com/meta-llama/llama-stack/pull/580 - -**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.55...v0.0.61 --- # v0.0.55 Published on: 2024-11-23T17:14:07Z -## What's Changed -* Fix TGI inference adapter -* Fix `llama stack build` in 0.0.54 by @dltn in https://github.com/meta-llama/llama-stack/pull/505 -* Several documentation related improvements -* Fix opentelemetry adapter by @dineshyv in https://github.com/meta-llama/llama-stack/pull/510 -* Update Ollama supported llama model list by @hickeyma in https://github.com/meta-llama/llama-stack/pull/483 -**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.54...v0.0.55 --- # v0.0.54 Published on: 2024-11-22T00:36:09Z -## What's Changed -* Bugfixes release on top of 0.0.53 -* Don't depend on templates.py when print llama stack build messages by @ashwinb in https://github.com/meta-llama/llama-stack/pull/496 -* Restructure docs by @dineshyv in https://github.com/meta-llama/llama-stack/pull/494 -* Since we are pushing for HF repos, we should accept them in inference configs by @ashwinb in https://github.com/meta-llama/llama-stack/pull/497 -* Fix fp8 quantization script. by @liyunlu0618 in https://github.com/meta-llama/llama-stack/pull/500 -* use logging instead of prints by @dineshyv in https://github.com/meta-llama/llama-stack/pull/499 -## New Contributors -* @liyunlu0618 made their first contribution in https://github.com/meta-llama/llama-stack/pull/500 - -**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.53...v0.0.54 --- # v0.0.53 Published on: 2024-11-20T22:18:00Z -🚀 Initial Release Notes for Llama Stack! 
-
-### Added
-- Resource-oriented design for models, shields, memory banks, datasets, and eval tasks
-- Persistence for registered objects with distribution
-- Ability to persist memory banks created for FAISS
-- PostgreSQL KVStore implementation
-- Environment variable placeholder support in run.yaml files
-- Comprehensive Zero-to-Hero notebooks and quickstart guides
-- Support for quantized models in Ollama
-- Vision model support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
-- Bedrock distribution with safety shields support
-- Evals API with task registration and scoring functions
-- MMLU and SimpleQA benchmark scoring functions
-- Huggingface dataset provider integration for benchmarks
-- Support for custom dataset registration from local paths
-- Benchmark evaluation CLI tools with visualization tables
-- RAG evaluation scoring functions and metrics
-- Local persistence for datasets and eval tasks
-
-### Changed
-- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
-- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
-- Updated API signatures for dataset and eval task registration
-- Restructured folder organization for providers
-- Enhanced Docker build configuration
-- Added version prefixing for REST API routes
-- Enhanced evaluation task registration workflow
-- Improved benchmark evaluation output formatting
-- Restructured evals folder organization for better modularity
-
-### Removed
-- `llama stack configure` command
-
-## What's Changed
-* Update download command by @Wauplin in https://github.com/meta-llama/llama-stack/pull/9
-* Update fbgemm version by @jianyuh in https://github.com/meta-llama/llama-stack/pull/12
-* Add CLI reference docs by @dltn in https://github.com/meta-llama/llama-stack/pull/14
-* Added Ollama as an inference impl by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/20
-* Hide older models by @dltn in https://github.com/meta-llama/llama-stack/pull/23
-* Introduce Llama stack distributions by @ashwinb in https://github.com/meta-llama/llama-stack/pull/22
-* Rename inline -> local by @dltn in https://github.com/meta-llama/llama-stack/pull/24
-* Avoid using nearly double the memory needed by @ashwinb in https://github.com/meta-llama/llama-stack/pull/30
-* Updates to prompt for tool calls by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/29
-* RFC-0001-The-Llama-Stack by @raghotham in https://github.com/meta-llama/llama-stack/pull/8
-* Add API keys to AgenticSystemConfig instead of relying on dotenv by @ashwinb in https://github.com/meta-llama/llama-stack/pull/33
-* update cli ref doc by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/34
-* fixed bug in download not enough disk space condition by @sisminnmaw in https://github.com/meta-llama/llama-stack/pull/35
-* Updated cli instructions with additonal details for each subcommands by @varunfb in https://github.com/meta-llama/llama-stack/pull/36
-* Updated URLs and addressed feedback by @varunfb in https://github.com/meta-llama/llama-stack/pull/37
-* Fireworks basic integration by @benjibc in https://github.com/meta-llama/llama-stack/pull/39
-* Together AI basic integration by @Nutlope in https://github.com/meta-llama/llama-stack/pull/43
-* Update LICENSE by @raghotham in https://github.com/meta-llama/llama-stack/pull/47
-* Add patch for SSE event endpoint responses by @dltn in https://github.com/meta-llama/llama-stack/pull/50
-* API Updates: fleshing out RAG APIs, introduce "llama stack" CLI 
command by @ashwinb in https://github.com/meta-llama/llama-stack/pull/51 -* [inference] Add a TGI adapter by @ashwinb in https://github.com/meta-llama/llama-stack/pull/52 -* upgrade llama_models by @benjibc in https://github.com/meta-llama/llama-stack/pull/55 -* Query generators for RAG query by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/54 -* Add Chroma and PGVector adapters by @ashwinb in https://github.com/meta-llama/llama-stack/pull/56 -* API spec update, client demo with Stainless SDK by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/58 -* Enable Bing search by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/59 -* add safety to openapi spec by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/62 -* Add config file based CLI by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/60 -* Simplified Telemetry API and tying it to logger by @ashwinb in https://github.com/meta-llama/llama-stack/pull/57 -* [Inference] Use huggingface_hub inference client for TGI adapter by @hanouticelina in https://github.com/meta-llama/llama-stack/pull/53 -* Support `data:` in URL for memory. Add ootb support for pdfs by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/67 -* Remove request wrapper migration by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/64 -* CLI Update: build -> configure -> run by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/69 -* API Updates by @ashwinb in https://github.com/meta-llama/llama-stack/pull/73 -* Unwrap ChatCompletionRequest for context_retriever by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/75 -* CLI - add back build wizard, configure with name instead of build.yaml by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/74 -* CLI: add build templates support, move imports by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/77 -* fix prompt with name args by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/80 -* Fix memory URL parsing by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/81 -* Allow TGI adaptor to have non-standard llama model names by @hardikjshah in https://github.com/meta-llama/llama-stack/pull/84 -* [API Updates] Model / shield / memory-bank routing + agent persistence + support for private headers by @ashwinb in https://github.com/meta-llama/llama-stack/pull/92 -* Bedrock Guardrails comiting after rebasing the fork by @rsgrewal-aws in https://github.com/meta-llama/llama-stack/pull/96 -* Bedrock Inference Integration by @poegej in https://github.com/meta-llama/llama-stack/pull/94 -* Support for Llama3.2 models and Swift SDK by @ashwinb in https://github.com/meta-llama/llama-stack/pull/98 -* fix safety using inference by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/99 -* Fixes typo for setup instruction for starting Llama Stack Server section by @abhishekmishragithub in https://github.com/meta-llama/llama-stack/pull/103 -* Make TGI adapter compatible with HF Inference API by @Wauplin in https://github.com/meta-llama/llama-stack/pull/97 -* Fix links & format by @machina-source in https://github.com/meta-llama/llama-stack/pull/104 -* docs: fix typo by @dijonkitchen in https://github.com/meta-llama/llama-stack/pull/107 -* LG safety fix by @kplawiak in https://github.com/meta-llama/llama-stack/pull/108 -* Minor typos, HuggingFace -> Hugging Face by @marklysze in https://github.com/meta-llama/llama-stack/pull/113 -* Reordered pip install and llama model download by 
@KarthiDreamr in https://github.com/meta-llama/llama-stack/pull/112 -* Update getting_started.ipynb by @delvingdeep in https://github.com/meta-llama/llama-stack/pull/117 -* fix: 404 link to agentic system repository by @moldhouse in https://github.com/meta-llama/llama-stack/pull/118 -* Fix broken links in RFC-0001-llama-stack.md by @bhimrazy in https://github.com/meta-llama/llama-stack/pull/134 -* Validate `name` in `llama stack build` by @russellb in https://github.com/meta-llama/llama-stack/pull/128 -* inference: Fix download command in error msg by @russellb in https://github.com/meta-llama/llama-stack/pull/133 -* configure: Fix a error msg typo by @russellb in https://github.com/meta-llama/llama-stack/pull/131 -* docs: Note how to use podman by @russellb in https://github.com/meta-llama/llama-stack/pull/130 -* add env for LLAMA_STACK_CONFIG_DIR by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/137 -* [bugfix] fix duplicate api endpoints by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/139 -* Use inference APIs for executing Llama Guard by @ashwinb in https://github.com/meta-llama/llama-stack/pull/121 -* fixing safety inference and safety adapter for new API spec. Pinned t… by @yogishbaliga in https://github.com/meta-llama/llama-stack/pull/105 -* [CLI] remove dependency on CONDA_PREFIX in CLI by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/144 -* [bugfix] fix #146 by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/147 -* Extract provider data properly (attempt 2) by @ashwinb in https://github.com/meta-llama/llama-stack/pull/148 -* `is_multimodal` accepts `core_model_id` not model itself. by @wizardbc in https://github.com/meta-llama/llama-stack/pull/153 -* fix broken bedrock inference provider by @moritalous in https://github.com/meta-llama/llama-stack/pull/151 -* Fix podman+selinux compatibility by @russellb in https://github.com/meta-llama/llama-stack/pull/132 -* docker: Install in editable mode for dev purposes by @russellb in https://github.com/meta-llama/llama-stack/pull/160 -* [CLI] simplify docker run by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/159 -* Add a RoutableProvider protocol, support for multiple routing keys by @ashwinb in https://github.com/meta-llama/llama-stack/pull/163 -* docker: Check for selinux before using `--security-opt` by @russellb in https://github.com/meta-llama/llama-stack/pull/167 -* Adds markdown-link-check and fixes a broken link by @codefromthecrypt in https://github.com/meta-llama/llama-stack/pull/165 -* [bugfix] conda path lookup by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/179 -* fix prompt guard by @ashwinb in https://github.com/meta-llama/llama-stack/pull/177 -* inference: Add model option to client by @russellb in https://github.com/meta-llama/llama-stack/pull/170 -* [CLI] avoid configure twice by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/171 -* Check that the model is found before use. 
by @AshleyT3 in https://github.com/meta-llama/llama-stack/pull/182 -* Add 'url' property to Redis KV config by @Minutis in https://github.com/meta-llama/llama-stack/pull/192 -* Inline vLLM inference provider by @russellb in https://github.com/meta-llama/llama-stack/pull/181 -* add databricks provider by @prithu-dasgupta in https://github.com/meta-llama/llama-stack/pull/83 -* add Weaviate memory adapter by @zainhas in https://github.com/meta-llama/llama-stack/pull/95 -* download: improve help text by @russellb in https://github.com/meta-llama/llama-stack/pull/204 -* Fix ValueError in case chunks are empty by @Minutis in https://github.com/meta-llama/llama-stack/pull/206 -* refactor docs by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/209 -* README.md: Add vLLM to providers table by @russellb in https://github.com/meta-llama/llama-stack/pull/207 -* Add .idea to .gitignore by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/216 -* [bugfix] Fix logprobs on meta-reference impl by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/213 -* Add classifiers in setup.py by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/217 -* Add function for stopping inference by @kebbbnnn in https://github.com/meta-llama/llama-stack/pull/224 -* JSON serialization for parallel processing queue by @dltn in https://github.com/meta-llama/llama-stack/pull/232 -* Remove "routing_table" and "routing_key" concepts for the user by @ashwinb in https://github.com/meta-llama/llama-stack/pull/201 -* ci: Run pre-commit checks in CI by @russellb in https://github.com/meta-llama/llama-stack/pull/176 -* Fix incorrect completion() signature for Databricks provider by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/236 -* Enable pre-commit on main branch by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/237 -* Switch to pre-commit/action by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/239 -* Remove request arg from chat completion response processing by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/240 -* Fix broken rendering in Google Colab by @frntn in https://github.com/meta-llama/llama-stack/pull/247 -* Docker compose scripts for remote adapters by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/241 -* Update getting_started.md by @MeDott29 in https://github.com/meta-llama/llama-stack/pull/260 -* Add llama download support for multiple models with comma-separated list by @tamdogood in https://github.com/meta-llama/llama-stack/pull/261 -* config templates restructure, docs by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/262 -* [bugfix] fix case for agent when memory bank registered without specifying provider_id by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/264 -* Add an option to not use elastic agents for meta-reference inference by @ashwinb in https://github.com/meta-llama/llama-stack/pull/269 -* Make all methods `async def` again; add completion() for meta-reference by @ashwinb in https://github.com/meta-llama/llama-stack/pull/270 -* Add vLLM inference provider for OpenAI compatible vLLM server by @terrytangyuan in https://github.com/meta-llama/llama-stack/pull/178 -* Update event_logger.py by @nehal-a2z in https://github.com/meta-llama/llama-stack/pull/275 -* llama stack distributions / templates / docker refactor by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/266 -* add more distro templates by @yanxi0830 in 
https://github.com/meta-llama/llama-stack/pull/279 -* first version of readthedocs by @raghotham in https://github.com/meta-llama/llama-stack/pull/278 -* add completion() for ollama by @dineshyv in https://github.com/meta-llama/llama-stack/pull/280 -* [Evals API] [1/n] Initial API by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/287 -* Add REST api example for chat_completion by @subramen in https://github.com/meta-llama/llama-stack/pull/286 -* feat: Qdrant Vector index support by @Anush008 in https://github.com/meta-llama/llama-stack/pull/221 -* Add support for Structured Output / Guided decoding by @ashwinb in https://github.com/meta-llama/llama-stack/pull/281 -* [bug] Fix import conflict for SamplingParams by @subramen in https://github.com/meta-llama/llama-stack/pull/285 -* Added implementations for get_agents_session, delete_agents_session and delete_agents by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/267 -* [Evals API][2/n] datasets / datasetio meta-reference implementation by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/288 -* Added tests for persistence by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/274 -* Support structured output for Together by @ashwinb in https://github.com/meta-llama/llama-stack/pull/289 -* dont set num_predict for all providers by @dineshyv in https://github.com/meta-llama/llama-stack/pull/294 -* Fix issue w/ routing_table api getting added when router api is not specified by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/298 -* New quantized models by @ashwinb in https://github.com/meta-llama/llama-stack/pull/301 -* [Evals API][3/n] scoring_functions / scoring meta-reference implementations by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/296 -* completion() for tgi by @dineshyv in https://github.com/meta-llama/llama-stack/pull/295 -* [enhancement] added templates and enhanced readme by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/307 -* Fix for get_agents_session by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/300 -* fix broken --list-templates with adding build.yaml files for packaging by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/327 -* Added hadamard transform for spinquant by @sacmehta in https://github.com/meta-llama/llama-stack/pull/326 -* [Evals API][4/n] evals with generation meta-reference impl by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/303 -* completion() for together by @dineshyv in https://github.com/meta-llama/llama-stack/pull/324 -* completion() for fireworks by @dineshyv in https://github.com/meta-llama/llama-stack/pull/329 -* [Evals API][6/n] meta-reference llm as judge, registration for ScoringFnDefs by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/330 -* update distributions compose/readme by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/338 -* distro readmes with model serving instructions by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/339 -* [Evals API][7/n] braintrust scoring provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/333 -* Kill --name from llama stack build by @ashwinb in https://github.com/meta-llama/llama-stack/pull/340 -* Do not cache pip by @stevegrubb in https://github.com/meta-llama/llama-stack/pull/349 -* add dynamic clients for all APIs by @ashwinb in https://github.com/meta-llama/llama-stack/pull/348 -* fix bedrock impl by @dineshyv in 
https://github.com/meta-llama/llama-stack/pull/359 -* [docs] update documentations by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/356 -* pgvector fixes by @dineshyv in https://github.com/meta-llama/llama-stack/pull/369 -* persist registered objects with distribution by @dineshyv in https://github.com/meta-llama/llama-stack/pull/354 -* Significantly simpler and malleable test setup by @ashwinb in https://github.com/meta-llama/llama-stack/pull/360 -* Correct a traceback in vllm by @stevegrubb in https://github.com/meta-llama/llama-stack/pull/366 -* add postgres kvstoreimpl by @dineshyv in https://github.com/meta-llama/llama-stack/pull/374 -* add ability to persist memory banks created for faiss by @dineshyv in https://github.com/meta-llama/llama-stack/pull/375 -* fix postgres config validation by @dineshyv in https://github.com/meta-llama/llama-stack/pull/380 -* Enable vision models for (Together, Fireworks, Meta-Reference, Ollama) by @ashwinb in https://github.com/meta-llama/llama-stack/pull/376 -* Kill `llama stack configure` by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/371 -* fix routing tables look up key for memory bank by @dineshyv in https://github.com/meta-llama/llama-stack/pull/383 -* add bedrock distribution code by @dineshyv in https://github.com/meta-llama/llama-stack/pull/358 -* Enable remote::vllm by @ashwinb in https://github.com/meta-llama/llama-stack/pull/384 -* Directory rename: `providers/impls` -> `providers/inline`, `providers/adapters` -> `providers/remote` by @ashwinb in https://github.com/meta-llama/llama-stack/pull/381 -* fix safety signature mismatch by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/388 -* Remove the safety adapter for Together; we can just use "meta-reference" by @ashwinb in https://github.com/meta-llama/llama-stack/pull/387 -* [LlamaStack][Fireworks] Update client and add unittest by @benjibc in https://github.com/meta-llama/llama-stack/pull/390 -* [bugfix] fix together data validator by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/393 -* Add provider deprecation support; change directory structure by @ashwinb in https://github.com/meta-llama/llama-stack/pull/397 -* Factor out create_dist_registry by @dltn in https://github.com/meta-llama/llama-stack/pull/398 -* [docs] refactor remote-hosted distro by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/402 -* [Evals API][10/n] API updates for EvalTaskDef + new test migration by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/379 -* Resource oriented design for shields by @dineshyv in https://github.com/meta-llama/llama-stack/pull/399 -* Add pip install helper for test and direct scenarios by @dltn in https://github.com/meta-llama/llama-stack/pull/404 -* migrate model to Resource and new registration signature by @dineshyv in https://github.com/meta-llama/llama-stack/pull/410 -* [Docs] Zero-to-Hero notebooks and quick start documentation by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/368 -* Distributions updates (slight updates to ollama, add inline-vllm and remote-vllm) by @ashwinb in https://github.com/meta-llama/llama-stack/pull/408 -* added quickstart w ollama and toolcalling using together by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/413 -* Split safety into (llama-guard, prompt-guard, code-scanner) by @ashwinb in https://github.com/meta-llama/llama-stack/pull/400 -* fix duplicate `deploy` in compose.yaml by @subramen in 
https://github.com/meta-llama/llama-stack/pull/417 -* [Evals API][11/n] huggingface dataset provider + mmlu scoring fn by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/392 -* Folder restructure for evals/datasets/scoring by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/419 -* migrate memory banks to Resource and new registration by @dineshyv in https://github.com/meta-llama/llama-stack/pull/411 -* migrate dataset to resource by @dineshyv in https://github.com/meta-llama/llama-stack/pull/420 -* migrate evals to resource by @dineshyv in https://github.com/meta-llama/llama-stack/pull/421 -* migrate scoring fns to resource by @dineshyv in https://github.com/meta-llama/llama-stack/pull/422 -* Rename all inline providers with an inline:: prefix by @ashwinb in https://github.com/meta-llama/llama-stack/pull/423 -* fix tests after registration migration & rename meta-reference -> basic / llm_as_judge provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/424 -* fix eval task registration by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/426 -* fix fireworks data validator by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/427 -* Allow specifying resources in StackRunConfig by @ashwinb in https://github.com/meta-llama/llama-stack/pull/425 -* Enable sane naming of registered objects with defaults by @ashwinb in https://github.com/meta-llama/llama-stack/pull/429 -* Remove the "ShieldType" concept by @ashwinb in https://github.com/meta-llama/llama-stack/pull/430 -* Inference to use provider resource id to register and validate by @dineshyv in https://github.com/meta-llama/llama-stack/pull/428 -* Kill "remote" providers and fix testing with a remote stack properly by @ashwinb in https://github.com/meta-llama/llama-stack/pull/435 -* add inline:: prefix for localfs provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/441 -* change schema -> dataset_schema for Dataset class by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/442 -* change schema -> dataset_schema for register_dataset api by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/443 -* PR-437-Fixed bug to allow system instructions after first turn by @cheesecake100201 in https://github.com/meta-llama/llama-stack/pull/440 -* add support for ${env.FOO_BAR} placeholders in run.yaml files by @ashwinb in https://github.com/meta-llama/llama-stack/pull/439 -* model registration in ollama and vllm check against the available models in the provider by @dineshyv in https://github.com/meta-llama/llama-stack/pull/446 -* Added link to the Colab notebook of the Llama Stack lesson on the Llama 3.2 course on DLAI by @jeffxtang in https://github.com/meta-llama/llama-stack/pull/445 -* make distribution registry thread safe and other fixes by @dineshyv in https://github.com/meta-llama/llama-stack/pull/449 -* local persistent for hf dataset provider by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/451 -* Support model resource updates and deletes by @dineshyv in https://github.com/meta-llama/llama-stack/pull/452 -* init registry once by @dineshyv in https://github.com/meta-llama/llama-stack/pull/450 -* local persistence for eval tasks by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/453 -* Fix build configure deprecation message by @hickeyma in https://github.com/meta-llama/llama-stack/pull/456 -* Support parallel downloads for `llama model download` by @ashwinb in https://github.com/meta-llama/llama-stack/pull/448 -* 
Add a verify-download command to llama CLI by @ashwinb in https://github.com/meta-llama/llama-stack/pull/457 -* unregister for memory banks and remove update API by @dineshyv in https://github.com/meta-llama/llama-stack/pull/458 -* move hf addapter->remote by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/459 -* await initialize in faiss by @dineshyv in https://github.com/meta-llama/llama-stack/pull/463 -* fix faiss serialize and serialize of index by @dineshyv in https://github.com/meta-llama/llama-stack/pull/464 -* Extend shorthand support for the `llama stack run` command by @vladimirivic in https://github.com/meta-llama/llama-stack/pull/465 -* [Agentic Eval] add ability to run agents generation by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/469 -* Auto-generate distro yamls + docs by @ashwinb in https://github.com/meta-llama/llama-stack/pull/468 -* Allow models to be registered as long as llama model is provided by @dineshyv in https://github.com/meta-llama/llama-stack/pull/472 -* get stack run config based on template name by @dineshyv in https://github.com/meta-llama/llama-stack/pull/477 -* add quantized model ollama support by @wukaixingxp in https://github.com/meta-llama/llama-stack/pull/471 -* Update kotlin client docs by @Riandy in https://github.com/meta-llama/llama-stack/pull/476 -* remove pydantic namespace warnings using model_config by @mattf in https://github.com/meta-llama/llama-stack/pull/470 -* fix llama stack build for together & llama stack build from templates by @yanxi0830 in https://github.com/meta-llama/llama-stack/pull/479 -* Add version to REST API url by @ashwinb in https://github.com/meta-llama/llama-stack/pull/478 -* support adding alias for models without hf repo/sku entry by @dineshyv in https://github.com/meta-llama/llama-stack/pull/481 -* update quick start to have the working instruction by @chuenlok in https://github.com/meta-llama/llama-stack/pull/467 -* add changelog by @dineshyv in https://github.com/meta-llama/llama-stack/pull/487 -* Added optional md5 validate command once download is completed by @varunfb in https://github.com/meta-llama/llama-stack/pull/486 -* Support Tavily as built-in search tool. 
by @iseeyuan in https://github.com/meta-llama/llama-stack/pull/485 -* Reorganizing Zero to Hero Folder structure by @heyjustinai in https://github.com/meta-llama/llama-stack/pull/447 -* fall to back to read from chroma/pgvector when not in cache by @dineshyv in https://github.com/meta-llama/llama-stack/pull/489 -* register with provider even if present in stack by @dineshyv in https://github.com/meta-llama/llama-stack/pull/491 -* Make run yaml optional so dockers can start with just --env by @ashwinb in https://github.com/meta-llama/llama-stack/pull/492 - -## New Contributors -* @Wauplin made their first contribution in https://github.com/meta-llama/llama-stack/pull/9 -* @jianyuh made their first contribution in https://github.com/meta-llama/llama-stack/pull/12 -* @dltn made their first contribution in https://github.com/meta-llama/llama-stack/pull/14 -* @hardikjshah made their first contribution in https://github.com/meta-llama/llama-stack/pull/20 -* @raghotham made their first contribution in https://github.com/meta-llama/llama-stack/pull/8 -* @jeffxtang made their first contribution in https://github.com/meta-llama/llama-stack/pull/34 -* @sisminnmaw made their first contribution in https://github.com/meta-llama/llama-stack/pull/35 -* @varunfb made their first contribution in https://github.com/meta-llama/llama-stack/pull/36 -* @benjibc made their first contribution in https://github.com/meta-llama/llama-stack/pull/39 -* @Nutlope made their first contribution in https://github.com/meta-llama/llama-stack/pull/43 -* @hanouticelina made their first contribution in https://github.com/meta-llama/llama-stack/pull/53 -* @rsgrewal-aws made their first contribution in https://github.com/meta-llama/llama-stack/pull/96 -* @poegej made their first contribution in https://github.com/meta-llama/llama-stack/pull/94 -* @abhishekmishragithub made their first contribution in https://github.com/meta-llama/llama-stack/pull/103 -* @machina-source made their first contribution in https://github.com/meta-llama/llama-stack/pull/104 -* @dijonkitchen made their first contribution in https://github.com/meta-llama/llama-stack/pull/107 -* @marklysze made their first contribution in https://github.com/meta-llama/llama-stack/pull/113 -* @KarthiDreamr made their first contribution in https://github.com/meta-llama/llama-stack/pull/112 -* @delvingdeep made their first contribution in https://github.com/meta-llama/llama-stack/pull/117 -* @moldhouse made their first contribution in https://github.com/meta-llama/llama-stack/pull/118 -* @bhimrazy made their first contribution in https://github.com/meta-llama/llama-stack/pull/134 -* @russellb made their first contribution in https://github.com/meta-llama/llama-stack/pull/128 -* @yogishbaliga made their first contribution in https://github.com/meta-llama/llama-stack/pull/105 -* @wizardbc made their first contribution in https://github.com/meta-llama/llama-stack/pull/153 -* @moritalous made their first contribution in https://github.com/meta-llama/llama-stack/pull/151 -* @codefromthecrypt made their first contribution in https://github.com/meta-llama/llama-stack/pull/165 -* @AshleyT3 made their first contribution in https://github.com/meta-llama/llama-stack/pull/182 -* @Minutis made their first contribution in https://github.com/meta-llama/llama-stack/pull/192 -* @prithu-dasgupta made their first contribution in https://github.com/meta-llama/llama-stack/pull/83 -* @zainhas made their first contribution in https://github.com/meta-llama/llama-stack/pull/95 -* @terrytangyuan made 
their first contribution in https://github.com/meta-llama/llama-stack/pull/216 -* @kebbbnnn made their first contribution in https://github.com/meta-llama/llama-stack/pull/224 -* @frntn made their first contribution in https://github.com/meta-llama/llama-stack/pull/247 -* @MeDott29 made their first contribution in https://github.com/meta-llama/llama-stack/pull/260 -* @tamdogood made their first contribution in https://github.com/meta-llama/llama-stack/pull/261 -* @nehal-a2z made their first contribution in https://github.com/meta-llama/llama-stack/pull/275 -* @dineshyv made their first contribution in https://github.com/meta-llama/llama-stack/pull/280 -* @subramen made their first contribution in https://github.com/meta-llama/llama-stack/pull/286 -* @Anush008 made their first contribution in https://github.com/meta-llama/llama-stack/pull/221 -* @cheesecake100201 made their first contribution in https://github.com/meta-llama/llama-stack/pull/267 -* @heyjustinai made their first contribution in https://github.com/meta-llama/llama-stack/pull/307 -* @sacmehta made their first contribution in https://github.com/meta-llama/llama-stack/pull/326 -* @stevegrubb made their first contribution in https://github.com/meta-llama/llama-stack/pull/349 -* @hickeyma made their first contribution in https://github.com/meta-llama/llama-stack/pull/456 -* @vladimirivic made their first contribution in https://github.com/meta-llama/llama-stack/pull/465 -* @wukaixingxp made their first contribution in https://github.com/meta-llama/llama-stack/pull/471 -* @Riandy made their first contribution in https://github.com/meta-llama/llama-stack/pull/476 -* @mattf made their first contribution in https://github.com/meta-llama/llama-stack/pull/470 -* @chuenlok made their first contribution in https://github.com/meta-llama/llama-stack/pull/467 -* @iseeyuan made their first contribution in https://github.com/meta-llama/llama-stack/pull/485 - -**Full Changelog**: https://github.com/meta-llama/llama-stack/commits/v0.0.53 +🚀 Initial Release Notes for Llama Stack! 
+
+### Added
+- Resource-oriented design for models, shields, memory banks, datasets, and eval tasks
+- Persistence for registered objects with distribution
+- Ability to persist memory banks created for FAISS
+- PostgreSQL KVStore implementation
+- Environment variable placeholder support in run.yaml files
+- Comprehensive Zero-to-Hero notebooks and quickstart guides
+- Support for quantized models in Ollama
+- Vision model support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
+- Bedrock distribution with safety shields support
+- Evals API with task registration and scoring functions
+- MMLU and SimpleQA benchmark scoring functions
+- Huggingface dataset provider integration for benchmarks
+- Support for custom dataset registration from local paths
+- Benchmark evaluation CLI tools with visualization tables
+- RAG evaluation scoring functions and metrics
+- Local persistence for datasets and eval tasks
+
+### Changed
+- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
+- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
+- Updated API signatures for dataset and eval task registration
+- Restructured folder organization for providers
+- Enhanced Docker build configuration
+- Added version prefixing for REST API routes
+- Enhanced evaluation task registration workflow
+- Improved benchmark evaluation output formatting
+- Restructured evals folder organization for better modularity
+
+### Removed
+- `llama stack configure` command
+
 ---

diff --git a/scripts/gen-changelog.py b/scripts/gen-changelog.py
index 3d5197e03..668146901 100644
--- a/scripts/gen-changelog.py
+++ b/scripts/gen-changelog.py
@@ -4,38 +4,71 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree. 
-import requests
 import os
+import requests
+
+
 def get_all_releases(token):
     url = f"https://api.github.com/repos/meta-llama/llama-stack/releases"
     headers = {"Accept": "application/vnd.github.v3+json"}
-    
+
     if token:
         headers["Authorization"] = f"token {token}"
-    
+
     response = requests.get(url, headers=headers)
-    
+
     if response.status_code == 200:
         return response.json()
     else:
-        raise Exception(f"Error fetching releases: {response.status_code}, {response.text}")
+        raise Exception(
+            f"Error fetching releases: {response.status_code}, {response.text}"
+        )
+
+
+def clean_release_body(body):
+    """Remove the "All changes", "What's Changed", and "New Contributors" sections from release notes."""
+    lines = body.split("\n")
+    cleaned_lines = []
+    skip_mode = False
+
+    for line in lines:
+        if line.strip() in [
+            "## All changes",
+            "### What's Changed",
+            "## What's Changed",
+            "## New Contributors",
+        ]:
+            skip_mode = True
+        elif skip_mode and line.startswith("##"):
+            # Found a new section heading, stop skipping
+            skip_mode = False
+            cleaned_lines.append(line)
+        elif not skip_mode:
+            cleaned_lines.append(line)
+
+    return "\n".join(cleaned_lines)


 def merge_release_notes(output_file, token=None):
     releases = get_all_releases(token)
-    
+
     with open(output_file, "w", encoding="utf-8") as md_file:
         md_file.write(f"# Changelog\n\n")
-    
+
         for release in releases:
             md_file.write(f"# {release['tag_name']}\n")
             md_file.write(f"Published on: {release['published_at']}\n\n")
-            md_file.write(f"{release['body']}\n\n")
+
+            # Strip the "All changes" / "What's Changed" / "New Contributors" sections from the release body
+            cleaned_body = clean_release_body(release["body"])
+            md_file.write(f"{cleaned_body}\n\n")
+
             md_file.write("---\n\n")
-    
+
     print(f"Merged release notes saved to {output_file}")

+
 if __name__ == "__main__":
     OUTPUT_FILE = "CHANGELOG.md"
     TOKEN = os.getenv("GITHUB_TOKEN")

From 95060127365a319cd3133a01feca3b19f1588e1a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 7 Mar 2025 17:31:00 -0500
Subject: [PATCH 28/58] build(deps): bump actions/upload-artifact from 3 to 4
 (#1486)

---
 .github/workflows/gha_workflow_llama_stack_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gha_workflow_llama_stack_tests.yml b/.github/workflows/gha_workflow_llama_stack_tests.yml
index 89e5edf71..1e94040f7 100644
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@@ -310,7 +310,7 @@ jobs:
       - name: "PR - Upload Test Summary"
         id: pr_test_summary_upload
         if: github.event_name == 'pull_request_target'
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: test-summary
           path: test-summary.md

From d63e798f6d66905a162adf75e44471f4546703eb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 7 Mar 2025 17:31:53 -0500
Subject: [PATCH 29/58] build(deps): bump thollander/actions-comment-pull-request from 2 to 3 (#1485)

---
 .github/workflows/gha_workflow_llama_stack_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gha_workflow_llama_stack_tests.yml b/.github/workflows/gha_workflow_llama_stack_tests.yml
index 1e94040f7..b10a40974 100644
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@@ -320,7 +320,7 @@ jobs:
       - name: "PR - Update comment"
         id: pr_update_comment
         if: github.event_name == 'pull_request_target'
-        uses: thollander/actions-comment-pull-request@v2
+        uses: 
thollander/actions-comment-pull-request@v3
         with:
           filePath: test-summary.md

From 89e449c2cbb9ac8117d6ede27b5fd0c7f5e8ca35 Mon Sep 17 00:00:00 2001
From: Botao Chen
Date: Fri, 7 Mar 2025 14:49:10 -0800
Subject: [PATCH 30/58] fix: Fix open benchmark template (#1496)

## What does this PR do?
Delete the `open_benchmark` template, which was accidentally generated by the auto codegen.

---
 distributions/dependencies.json               |  36 --
 .../templates/open-benchmark/__init__.py      |   7 -
 .../open-benchmark/open_benchmark.py          | 178 ---------
 .../templates/open_benchmark/build.yaml       |  37 --
 llama_stack/templates/open_benchmark/run.yaml | 364 ------------------
 5 files changed, 622 deletions(-)
 delete mode 100644 llama_stack/templates/open-benchmark/__init__.py
 delete mode 100644 llama_stack/templates/open-benchmark/open_benchmark.py
 delete mode 100644 llama_stack/templates/open_benchmark/build.yaml
 delete mode 100644 llama_stack/templates/open_benchmark/run.yaml

diff --git a/distributions/dependencies.json b/distributions/dependencies.json
index 5623e251a..59b0c9e62 100644
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@@ -453,42 +453,6 @@
     "transformers",
     "uvicorn"
   ],
-  "open_benchmark": [
-    "aiosqlite",
-    "autoevals",
-    "blobfile",
-    "chardet",
-    "chromadb-client",
-    "datasets",
-    "fastapi",
-    "fire",
-    "httpx",
-    "litellm",
-    "matplotlib",
-    "mcp",
-    "nltk",
-    "numpy",
-    "openai",
-    "opentelemetry-exporter-otlp-proto-http",
-    "opentelemetry-sdk",
-    "pandas",
-    "pillow",
-    "psycopg2-binary",
-    "pymongo",
-    "pypdf",
-    "redis",
-    "requests",
-    "scikit-learn",
-    "scipy",
-    "sentencepiece",
-    "sqlite-vec",
-    "together",
-    "tqdm",
-    "transformers",
-    "uvicorn",
-    "sentence-transformers --no-deps",
-    "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
-  ],
   "remote-vllm": [
     "aiosqlite",
     "autoevals",
diff --git a/llama_stack/templates/open-benchmark/__init__.py b/llama_stack/templates/open-benchmark/__init__.py
deleted file mode 100644
index 14d0a28f5..000000000
--- a/llama_stack/templates/open-benchmark/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .open_benchmark import get_distribution_template  # noqa: F401
diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
deleted file mode 100644
index 9ef84456e..000000000
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ /dev/null
@@ -1,178 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree. 
- -from typing import List, Tuple - -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig -from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig -from llama_stack.providers.remote.inference.anthropic.models import MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES -from llama_stack.providers.remote.inference.gemini.config import GeminiConfig -from llama_stack.providers.remote.inference.gemini.models import MODEL_ENTRIES as GEMINI_MODEL_ENTRIES -from llama_stack.providers.remote.inference.groq.config import GroqConfig -from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES as GROQ_MODEL_ENTRIES -from llama_stack.providers.remote.inference.openai.config import OpenAIConfig -from llama_stack.providers.remote.inference.openai.models import MODEL_ENTRIES as OPENAI_MODEL_ENTRIES -from llama_stack.providers.remote.inference.together.config import TogetherImplConfig -from llama_stack.providers.remote.inference.together.models import MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES -from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig -from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry - - -def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: - # in this template, we allow each API key to be optional - providers = [ - ( - "openai", - OPENAI_MODEL_ENTRIES, - OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:}"), - ), - ( - "anthropic", - ANTHROPIC_MODEL_ENTRIES, - AnthropicConfig.sample_run_config(api_key="${env.ANTHROPIC_API_KEY:}"), - ), - ( - "gemini", - GEMINI_MODEL_ENTRIES, - GeminiConfig.sample_run_config(api_key="${env.GEMINI_API_KEY:}"), - ), - ( - "groq", - GROQ_MODEL_ENTRIES, - GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"), - ), - ( - "together", - TOGETHER_MODEL_ENTRIES, - TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), - ), - ] - inference_providers = [] - available_models = {} - for provider_id, model_entries, config in providers: - inference_providers.append( - Provider( - provider_id=provider_id, - provider_type=f"remote::{provider_id}", - config=config, - ) - ) - available_models[provider_id] = model_entries - return inference_providers, available_models - - -def get_distribution_template() -> DistributionTemplate: - inference_providers, available_models = get_inference_providers() - providers = { - "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), - "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::code-interpreter", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "open_benchmark" - - vector_io_providers = [ - Provider( - provider_id="sqlite-vec", - provider_type="inline::sqlite-vec", - config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ), - Provider( 
- provider_id="${env.ENABLE_CHROMADB+chromadb}", - provider_type="remote::chromadb", - config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"), - ), - Provider( - provider_id="${env.ENABLE_PGVECTOR+pgvector}", - provider_type="remote::pgvector", - config=PGVectorVectorIOConfig.sample_run_config( - db="${env.PGVECTOR_DB:}", - user="${env.PGVECTOR_USER:}", - password="${env.PGVECTOR_PASSWORD:}", - ), - ), - ] - - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ToolGroupInput( - toolgroup_id="builtin::code_interpreter", - provider_id="code-interpreter", - ), - ] - - default_models = get_model_registry(available_models) - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Distribution for running open benchmarks", - container_image=None, - template_path=None, - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": inference_providers, - "vector_io": vector_io_providers, - }, - default_models=default_models, - default_tool_groups=default_tool_groups, - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "5001", - "Port for the Llama Stack distribution server", - ), - "OPENAI_API_KEY": ( - "", - "OpenAI API Key", - ), - "GEMINI_API_KEY": ( - "", - "Gemini API Key", - ), - "GROQ_API_KEY": ( - "", - "Groq API Key", - ), - "ANTHROPIC_API_KEY": ( - "", - "Anthropic API Key", - ), - "TOGETHER_API_KEY": ( - "", - "Together API Key", - ), - }, - ) diff --git a/llama_stack/templates/open_benchmark/build.yaml b/llama_stack/templates/open_benchmark/build.yaml deleted file mode 100644 index 367dd1374..000000000 --- a/llama_stack/templates/open_benchmark/build.yaml +++ /dev/null @@ -1,37 +0,0 @@ -version: '2' -distribution_spec: - description: Distribution for running open benchmarks - providers: - inference: - - remote::openai - - remote::anthropic - - remote::gemini - - remote::groq - - remote::together - - inline::sentence-transformers - vector_io: - - inline::sqlite-vec - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::code-interpreter - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda diff --git a/llama_stack/templates/open_benchmark/run.yaml b/llama_stack/templates/open_benchmark/run.yaml deleted file mode 100644 index e98c2c708..000000000 --- a/llama_stack/templates/open_benchmark/run.yaml +++ /dev/null @@ -1,364 +0,0 @@ -version: '2' -image_name: open_benchmark -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: openai - provider_type: remote::openai - config: - api_key: ${env.OPENAI_API_KEY:} - - provider_id: anthropic - provider_type: remote::anthropic - config: - api_key: ${env.ANTHROPIC_API_KEY:} - - provider_id: gemini - provider_type: remote::gemini - config: - api_key: ${env.GEMINI_API_KEY:} - - provider_id: groq - provider_type: 
remote::groq - config: - url: https://api.groq.com - api_key: ${env.GROQ_API_KEY:} - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} - vector_io: - - provider_id: sqlite-vec - provider_type: inline::sqlite-vec - config: - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open_benchmark}/sqlite_vec.db - - provider_id: ${env.ENABLE_CHROMADB+chromadb} - provider_type: remote::chromadb - config: - url: ${env.CHROMADB_URL:} - - provider_id: ${env.ENABLE_PGVECTOR+pgvector} - provider_type: remote::pgvector - config: - host: ${env.PGVECTOR_HOST:localhost} - port: ${env.PGVECTOR_PORT:5432} - db: ${env.PGVECTOR_DB:} - user: ${env.PGVECTOR_USER:} - password: ${env.PGVECTOR_PASSWORD:} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open_benchmark}/agents_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: ${env.OTEL_SERVICE_NAME:llama-stack} - sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open_benchmark/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: {} - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: {} - - provider_id: localfs - provider_type: inline::localfs - config: {} - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:} - max_results: 3 - - provider_id: code-interpreter - provider_type: inline::code-interpreter - config: {} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open_benchmark}/registry.db -models: -- metadata: {} - model_id: openai/gpt-4o - provider_id: openai - provider_model_id: openai/gpt-4o - model_type: llm -- metadata: {} - model_id: openai/gpt-4o-mini - provider_id: openai - provider_model_id: openai/gpt-4o-mini - model_type: llm -- metadata: {} - model_id: openai/chatgpt-4o-latest - provider_id: openai - provider_model_id: openai/chatgpt-4o-latest - model_type: llm -- metadata: - embedding_dimension: 1536 - context_length: 8192 - model_id: openai/text-embedding-3-small - provider_id: openai - provider_model_id: openai/text-embedding-3-small - model_type: embedding -- metadata: - embedding_dimension: 3072 - context_length: 8192 - model_id: openai/text-embedding-3-large - provider_id: openai - provider_model_id: openai/text-embedding-3-large - model_type: embedding -- metadata: {} - model_id: anthropic/claude-3-5-sonnet-latest - provider_id: anthropic - provider_model_id: anthropic/claude-3-5-sonnet-latest - model_type: llm 
-- metadata: {} - model_id: anthropic/claude-3-7-sonnet-latest - provider_id: anthropic - provider_model_id: anthropic/claude-3-7-sonnet-latest - model_type: llm -- metadata: {} - model_id: anthropic/claude-3-5-haiku-latest - provider_id: anthropic - provider_model_id: anthropic/claude-3-5-haiku-latest - model_type: llm -- metadata: - embedding_dimension: 1024 - context_length: 32000 - model_id: anthropic/voyage-3 - provider_id: anthropic - provider_model_id: anthropic/voyage-3 - model_type: embedding -- metadata: - embedding_dimension: 512 - context_length: 32000 - model_id: anthropic/voyage-3-lite - provider_id: anthropic - provider_model_id: anthropic/voyage-3-lite - model_type: embedding -- metadata: - embedding_dimension: 1024 - context_length: 32000 - model_id: anthropic/voyage-code-3 - provider_id: anthropic - provider_model_id: anthropic/voyage-code-3 - model_type: embedding -- metadata: {} - model_id: gemini/gemini-1.5-flash - provider_id: gemini - provider_model_id: gemini/gemini-1.5-flash - model_type: llm -- metadata: {} - model_id: gemini/gemini-1.5-pro - provider_id: gemini - provider_model_id: gemini/gemini-1.5-pro - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 2048 - model_id: gemini/text-embedding-004 - provider_id: gemini - provider_model_id: gemini/text-embedding-004 - model_type: embedding -- metadata: {} - model_id: groq/llama3-8b-8192 - provider_id: groq - provider_model_id: groq/llama3-8b-8192 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: groq - provider_model_id: groq/llama3-8b-8192 - model_type: llm -- metadata: {} - model_id: groq/llama-3.1-8b-instant - provider_id: groq - provider_model_id: groq/llama-3.1-8b-instant - model_type: llm -- metadata: {} - model_id: groq/llama3-70b-8192 - provider_id: groq - provider_model_id: groq/llama3-70b-8192 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3-70B-Instruct - provider_id: groq - provider_model_id: groq/llama3-70b-8192 - model_type: llm -- metadata: {} - model_id: groq/llama-3.3-70b-versatile - provider_id: groq - provider_model_id: groq/llama-3.3-70b-versatile - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: groq - provider_model_id: groq/llama-3.3-70b-versatile - model_type: llm -- metadata: {} - model_id: groq/llama-3.2-3b-preview - provider_id: groq - provider_model_id: groq/llama-3.2-3b-preview - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: groq - provider_model_id: groq/llama-3.2-3b-preview - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: 
meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-Guard-3-8B - provider_id: together - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: together - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: together - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: togethercomputer/m2-bert-80M-8k-retrieval - provider_id: together - provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval - model_type: embedding -- metadata: - embedding_dimension: 768 - context_length: 32768 - model_id: togethercomputer/m2-bert-80M-32k-retrieval - provider_id: together - provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::code_interpreter - provider_id: code-interpreter -server: - port: 8321 From ade76e4a69e679c88742f25d1dd0e99636e48ede Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Fri, 7 Mar 2025 15:05:27 -0800 Subject: [PATCH 31/58] fix: update the open benchmark eval doc (#1497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What does this PR do? 
Add proper links to the doc.

## Test
Previewed the doc.

---
 docs/source/concepts/evaluation_concepts.md     | 2 +-
 docs/source/references/evals_reference/index.md | 2 +-
 llama_stack/templates/open-benchmark/run.yaml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/concepts/evaluation_concepts.md b/docs/source/concepts/evaluation_concepts.md
index 61a695d9f..abe5898b6 100644
--- a/docs/source/concepts/evaluation_concepts.md
+++ b/docs/source/concepts/evaluation_concepts.md
@@ -37,7 +37,7 @@ The list of open-benchmarks we currently support:
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI)]: Benchmark designed to evaluate multimodal models.

-You can follow this contributing guidance to add more open-benchmarks to Llama Stack
+You can follow this [contributing guide](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack

### Run evaluation on open-benchmarks via CLI

diff --git a/docs/source/references/evals_reference/index.md b/docs/source/references/evals_reference/index.md
index d55537c47..c10becc7d 100644
--- a/docs/source/references/evals_reference/index.md
+++ b/docs/source/references/evals_reference/index.md
@@ -372,7 +372,7 @@ The purpose of scoring function is to calculate the score for each example based
Firstly, you can see if the existing [llama stack scoring functions](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/scoring) can fulfill your need. If not, you need to write a new scoring function based on what benchmark author / other open source repo describe.

### Add new benchmark into template
-Firstly, you need to add the evaluation dataset associated with your benchmark under `datasets` resource in templates/open-benchmark/run.yaml
+Firstly, you need to add the evaluation dataset associated with your benchmark under `datasets` resource in the [open-benchmark](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/open-benchmark/run.yaml)

Secondly, you need to add the new benchmark you just created under the `benchmarks` resource in the same template. 
To add the new benchmark, you need to have - `benchmark_id`: identifier of the benchmark diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index ba495923c..47a2f2eb5 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -1,5 +1,5 @@ version: '2' -image_name: dev +image_name: open-benchmark apis: - agents - datasetio From 23e39cc3c496f711bc9e7ca52e396f4542a61b81 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Mar 2025 15:58:26 -0800 Subject: [PATCH 32/58] fix: handle log errors (#1499) Summary: | File "/Users/erichuang/projects/llama-stack/llama_stack/distribution/server/server.py", line 213, in sse_generator | logger.exception(f"Error in sse_generator: {e}") | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 1864, in exception | self.log(ERROR, msg, *args, exc_info=exc_info, **kwargs) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 1879, in log | self.logger.log(level, msg, *args, **kwargs) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 1547, in log | self._log(level, msg, args, **kwargs) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 1624, in _log | self.handle(record) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 1634, in handle | self.callHandlers(record) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 1696, in callHandlers | hdlr.handle(record) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/logging/__init__.py", line 968, in handle | self.emit(record) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/logging.py", line 167, in emit | message_renderable = self.render_message(record, message) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/logging.py", line 193, in render_message | message_text = Text.from_markup(message) if use_markup else Text(message) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/text.py", line 287, in from_markup | rendered_text = render(text, style, emoji=emoji, emoji_variant=emoji_variant) | File "/opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.10/site-packages/rich/markup.py", line 167, in render | raise MarkupError( | rich.errors.MarkupError: closing tag '[/INST]' at position 105 doesn't match any open tag Test Plan: reran failing rag_with_vector_db example --- llama_stack/log.py | 13 +++++++++++++ .../inline/agents/meta_reference/agent_instance.py | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/llama_stack/log.py b/llama_stack/log.py index 11aa1bf7e..481385974 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -11,6 +11,7 @@ from typing import Dict from rich.console import Console from rich.logging import RichHandler +from rich.errors import MarkupError # Default log level DEFAULT_LOG_LEVEL = logging.INFO @@ -82,6 +83,18 @@ class CustomRichHandler(RichHandler): kwargs["console"] = Console(width=120) super().__init__(*args, **kwargs) + def emit(self, record): + """Override emit to handle markup errors gracefully.""" + try: + super().emit(record) + except MarkupError: + original_markup = self.markup + self.markup = False + try: + super().emit(record) 
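+                # The fallback emit above runs with markup disabled so that
+                # literal tags like "[/INST]" in log messages are not parsed
+                # as rich markup; the original setting is restored below.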
+ finally: + self.markup = original_markup + def setup_logging(category_levels: Dict[str, int]) -> None: """ diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index b7cba4e46..3619b3f67 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -16,7 +16,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import httpx -from rich.markup import escape from llama_stack.apis.agents import ( AgentConfig, @@ -1030,7 +1029,7 @@ async def execute_tool_call_maybe( **toolgroup_args.get(group_name, {}), }, ) - logger.info(f"tool call {name} completed with result: {escape(str(result))}") + logger.info(f"tool call {name} completed with result: {result}") return result From c4e527b21c103fab7b0887236620d7cd37841c6c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 8 Mar 2025 00:25:40 +0000 Subject: [PATCH 33/58] Bump version to 0.1.6 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb3065ced..077214354 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.5" +version = "0.1.6" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -26,7 +26,7 @@ dependencies = [ "httpx", "huggingface-hub", "jsonschema", - "llama-stack-client>=0.1.5", + "llama-stack-client>=0.1.6", "prompt-toolkit", "python-dotenv", "pydantic>=2", From 0db3a2f511c1e4d5017cbede8e095303397d8d7a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Mar 2025 16:31:42 -0800 Subject: [PATCH 34/58] fix: run pre-commit due to release script bumps --- llama_stack/log.py | 2 +- requirements.txt | 2 +- uv.lock | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llama_stack/log.py b/llama_stack/log.py index 481385974..175427f5c 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -10,8 +10,8 @@ from logging.config import dictConfig from typing import Dict from rich.console import Console -from rich.logging import RichHandler from rich.errors import MarkupError +from rich.logging import RichHandler # Default log level DEFAULT_LOG_LEVEL = logging.INFO diff --git a/requirements.txt b/requirements.txt index d2e2e7a29..066c9f790 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,7 @@ huggingface-hub==0.29.0 idna==3.10 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -llama-stack-client==0.1.5 +llama-stack-client==0.1.6 lxml==5.3.1 markdown-it-py==3.0.0 mdurl==0.1.2 diff --git a/uv.lock b/uv.lock index 09ad0815e..a5c26a303 100644 --- a/uv.lock +++ b/uv.lock @@ -862,7 +862,7 @@ wheels = [ [[package]] name = "llama-stack" -version = "0.1.5" +version = "0.1.6" source = { editable = "." 
} dependencies = [ { name = "blobfile" }, @@ -946,7 +946,7 @@ requires-dist = [ { name = "huggingface-hub" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, - { name = "llama-stack-client", specifier = ">=0.1.5" }, + { name = "llama-stack-client", specifier = ">=0.1.6" }, { name = "lm-format-enforcer", marker = "extra == 'test'", specifier = ">=0.10.9" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, @@ -992,7 +992,7 @@ provides-extras = ["dev", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" -version = "0.1.5" +version = "0.1.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1009,9 +1009,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/72/26/24b8dcd97dadee66cf0b9a3cb0ee18c65a92b8732de76c1aec97d85306e2/llama_stack_client-0.1.5.tar.gz", hash = "sha256:f342969920c87d9518298fade6debecb15b7c19899eed241d61253be2bf35053", size = 261420 } +sdist = { url = "https://files.pythonhosted.org/packages/b5/48/70ffdc7ab655234794e9559de9b1776b39610c09aaee8d3bc74bfbd570b4/llama_stack_client-0.1.6.tar.gz", hash = "sha256:92c6c55c3281839e690df7bfc289c36a5dde0f491574bbdb6b8b665dc3d5a16c", size = 264874 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/07/329a5220325a3a352967717e8878db1edc9c88616e36e0a1e819571067c0/llama_stack_client-0.1.5-py3-none-any.whl", hash = "sha256:2aeff88b6f836d71fd2c75d087ccc19d881fca769e05636b0ddf7b41a7c4aef8", size = 369754 }, + { url = "https://files.pythonhosted.org/packages/38/51/1102914f819cf4412a5c9fd3f7dcc28175608e5f01ee164885972c3ec30b/llama_stack_client-0.1.6-py3-none-any.whl", hash = "sha256:708e20630d4e97a1cb03a19b933f4da6748cc857fe170998c392cf0f30f0f4c7", size = 373941 }, ] [[package]] From 6033e6893ede25fe542c4128b5c5f5254dbcc7a2 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sat, 8 Mar 2025 19:20:08 -0500 Subject: [PATCH 35/58] docs: Add v0.1.6 release notes to changelog (#1506) # What does this PR do? Adds v0.1.6 release notes to changelog. 
Signed-off-by: Yuan Tang

---
 CHANGELOG.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e544e93f..62862ebdc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,38 @@
# Changelog

+# v0.1.6
+Published on: 2025-03-08T04:35:08Z
+
+## 0.1.6 Release Notes
+
+### Build and Test Agents
+* Inference: Fixed support for inline vllm provider
+* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
+* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
+* Agent: Unify tools and Python SDK Agents API
+* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
+* Agent: Support python functions without @client_tool decorator as client tools
+* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
+* VectorIO: MilvusDB support added
+
+### Agent Evals and Model Customization
+* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
+* Eval: Documentation for eval, scoring, adding new benchmarks
+* Eval: Distribution template to run benchmarks on llama & non-llama models
+* Eval: Ability to register new custom LLM-as-judge scoring functions
+* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
+
+### Deploy and Monitoring of Agents
+* Better support for different log levels across all components for better monitoring
+
+### Better Engineering
+* Enhance OpenAPI spec to include Error types across all APIs
+* Moved all tests to /tests and created unit tests to run on each PR
+* Removed all dependencies on llama-models repo
+
+
+---
+
 # v0.1.5.1
 Published on: 2025-02-28T22:37:44Z
 

From 205661bc78e2a9895164d68291d19fb83bea4ba2 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Sat, 8 Mar 2025 22:56:30 -0800
Subject: [PATCH 36/58] fix: Use re-entrancy and concurrency safe context
 managers for provider data (#1498)

Concurrent requests should not trample (or reuse) each other's provider
data. Provider data should be scoped to each request.

## Test Plan

Set the uvicorn server to have a single worker process + thread by
updating the config:
```python
uvicorn_config = {
    ...
    "workers": 1,
    "loop": "asyncio",
}
```

Then perform the following steps on `origin/main` (without this change).

(1) Run the server using `llama stack run dev` without having
`FIREWORKS_API_KEY` in the environment.

(2) Run a test by specifying the FIREWORKS_API_KEY env var so it gets
stored in the thread-local
```
pytest -s -v tests/integration/inference/test_text_inference.py \
    --stack-config http://localhost:8321 \
    --text-model accounts/fireworks/models/llama-v3p1-8b-instruct \
    -k test_text_chat_completion_with_tool_calling_and_streaming \
    --env FIREWORKS_API_KEY=<...>
```

Ensure you don't have any other API keys in the environment (otherwise
the bug will not reproduce due to other specifics in our testing code).
Verify this works.

(3) Run the same command again without specifying FIREWORKS_API_KEY.
See that the request actually succeeds when it *should have failed*.
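As an aside, here is a minimal, self-contained sketch (not part of this
patch; all names are illustrative) of why per-request data has to move from
a thread-local to a `contextvars.ContextVar`: concurrent requests interleave
on a single event-loop thread and therefore share a thread-local, while each
asyncio task runs in its own copy of the context.

```python
import asyncio
import contextvars
import threading

_local = threading.local()  # shared by every task on the same thread
_ctx = contextvars.ContextVar("provider_data", default=None)  # per-task copy


async def handle_request(name: str, api_key: str) -> None:
    _local.value = api_key
    _ctx.set(api_key)
    await asyncio.sleep(0)  # yield, letting the other "request" run
    # The thread-local now holds whichever request wrote last, but the
    # ContextVar still holds this request's own value.
    print(name, "thread-local:", _local.value, "contextvar:", _ctx.get())


async def main() -> None:
    # asyncio.gather wraps each coroutine in a Task, and every Task runs
    # in a copy of the current context (PEP 567).
    await asyncio.gather(
        handle_request("req-1", "key-1"),
        handle_request("req-2", "key-2"),
    )


asyncio.run(main())
```

Here `req-1` observes `key-2` in the thread-local (trampled by `req-2`) but
still sees its own `key-1` via the context variable, which is exactly the
trampling scenario the steps above reproduce against the Fireworks provider.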
---- Now do the same tests on this branch, verify step (3) results in failure. Finally, run the full `test_text_inference.py` test suite with this change, verify it succeeds. --- llama_stack/distribution/library_client.py | 40 ++++++---- llama_stack/distribution/request_headers.py | 73 ++++++++++++++++--- llama_stack/distribution/server/server.py | 35 +++++---- .../remote/inference/fireworks/fireworks.py | 5 +- .../remote/inference/together/together.py | 5 +- tests/integration/fixtures/common.py | 2 +- 6 files changed, 114 insertions(+), 46 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 8915daf5a..ab8ff60fa 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -32,7 +32,10 @@ from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config from llama_stack.distribution.datatypes import Api -from llama_stack.distribution.request_headers import set_request_provider_data +from llama_stack.distribution.request_headers import ( + preserve_headers_context_async_generator, + request_provider_data_context, +) from llama_stack.distribution.resolver import ProviderRegistry from llama_stack.distribution.server.endpoints import get_all_api_endpoints from llama_stack.distribution.stack import ( @@ -262,21 +265,25 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): if not self.endpoint_impls: raise ValueError("Client not initialized") + # Create headers with provider data if available + headers = {} if self.provider_data: - set_request_provider_data({"X-LlamaStack-Provider-Data": json.dumps(self.provider_data)}) + headers["X-LlamaStack-Provider-Data"] = json.dumps(self.provider_data) - if stream: - response = await self._call_streaming( - cast_to=cast_to, - options=options, - stream_cls=stream_cls, - ) - else: - response = await self._call_non_streaming( - cast_to=cast_to, - options=options, - ) - return response + # Use context manager for provider data + with request_provider_data_context(headers): + if stream: + response = await self._call_streaming( + cast_to=cast_to, + options=options, + stream_cls=stream_cls, + ) + else: + response = await self._call_non_streaming( + cast_to=cast_to, + options=options, + ) + return response def _find_matching_endpoint(self, method: str, path: str) -> tuple[Any, dict]: """Find the matching endpoint implementation for a given method and path. @@ -374,9 +381,12 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): finally: await end_trace() + # Wrap the generator to preserve context across iterations + wrapped_gen = preserve_headers_context_async_generator(gen()) + mock_response = httpx.Response( status_code=httpx.codes.OK, - content=gen(), + content=wrapped_gen, headers={ "Content-Type": "application/json", }, diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py index 2a9bc622a..19afae59b 100644 --- a/llama_stack/distribution/request_headers.py +++ b/llama_stack/distribution/request_headers.py @@ -4,16 +4,62 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import contextvars import json import logging -import threading -from typing import Any, Dict +from typing import Any, AsyncGenerator, ContextManager, Dict, Optional, TypeVar from .utils.dynamic import instantiate_class_type log = logging.getLogger(__name__) -_THREAD_LOCAL = threading.local() +# Context variable for request provider data +_provider_data_var = contextvars.ContextVar("provider_data", default=None) + + +class RequestProviderDataContext(ContextManager): + """Context manager for request provider data""" + + def __init__(self, provider_data: Optional[Dict[str, Any]] = None): + self.provider_data = provider_data + self.token = None + + def __enter__(self): + # Save the current value and set the new one + self.token = _provider_data_var.set(self.provider_data) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Restore the previous value + if self.token is not None: + _provider_data_var.reset(self.token) + + +T = TypeVar("T") + + +def preserve_headers_context_async_generator(gen: AsyncGenerator[T, None]) -> AsyncGenerator[T, None]: + """ + Wraps an async generator to preserve request headers context variables across iterations. + + This ensures that context variables set during generator creation are + available during each iteration of the generator, even if the original + context manager has exited. + """ + # Capture the current context value right now + context_value = _provider_data_var.get() + + async def wrapper(): + while True: + # Set context before each anext() call + _ = _provider_data_var.set(context_value) + try: + item = await gen.__anext__() + yield item + except StopAsyncIteration: + break + + return wrapper() class NeedsRequestProviderData: @@ -26,7 +72,7 @@ class NeedsRequestProviderData: if not validator_class: raise ValueError(f"Provider {provider_type} does not have a validator") - val = getattr(_THREAD_LOCAL, "provider_data_header_value", None) + val = _provider_data_var.get() if not val: return None @@ -36,25 +82,32 @@ class NeedsRequestProviderData: return provider_data except Exception as e: log.error(f"Error parsing provider data: {e}") + return None -def set_request_provider_data(headers: Dict[str, str]): +def parse_request_provider_data(headers: Dict[str, str]) -> Optional[Dict[str, Any]]: + """Parse provider data from request headers""" keys = [ "X-LlamaStack-Provider-Data", "x-llamastack-provider-data", ] + val = None for key in keys: val = headers.get(key, None) if val: break if not val: - return + return None try: - val = json.loads(val) + return json.loads(val) except json.JSONDecodeError: - log.error("Provider data not encoded as a JSON object!", val) - return + log.error("Provider data not encoded as a JSON object!") + return None - _THREAD_LOCAL.provider_data_header_value = val + +def request_provider_data_context(headers: Dict[str, str]) -> ContextManager: + """Context manager that sets request provider data from headers for the duration of the context""" + provider_data = parse_request_provider_data(headers) + return RequestProviderDataContext(provider_data) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index c4ef79a69..347d88a2c 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -29,7 +29,10 @@ from typing_extensions import Annotated from llama_stack.distribution.datatypes import StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis -from 
llama_stack.distribution.request_headers import set_request_provider_data +from llama_stack.distribution.request_headers import ( + preserve_headers_context_async_generator, + request_provider_data_context, +) from llama_stack.distribution.resolver import InvalidProviderError from llama_stack.distribution.stack import ( construct_stack, @@ -202,16 +205,14 @@ async def maybe_await(value): async def sse_generator(event_gen): try: - event_gen = await event_gen - async for item in event_gen: + async for item in await event_gen: yield create_sse_event(item) await asyncio.sleep(0.01) except asyncio.CancelledError: logger.info("Generator cancelled") await event_gen.aclose() except Exception as e: - logger.exception(f"Error in sse_generator: {e}") - logger.exception(f"Traceback: {''.join(traceback.format_exception(type(e), e, e.__traceback__))}") + logger.exception("Error in sse_generator") yield create_sse_event( { "error": { @@ -223,18 +224,20 @@ async def sse_generator(event_gen): def create_dynamic_typed_route(func: Any, method: str, route: str): async def endpoint(request: Request, **kwargs): - set_request_provider_data(request.headers) + # Use context manager for request provider data + with request_provider_data_context(request.headers): + is_streaming = is_streaming_request(func.__name__, request, **kwargs) - is_streaming = is_streaming_request(func.__name__, request, **kwargs) - try: - if is_streaming: - return StreamingResponse(sse_generator(func(**kwargs)), media_type="text/event-stream") - else: - value = func(**kwargs) - return await maybe_await(value) - except Exception as e: - traceback.print_exception(e) - raise translate_exception(e) from e + try: + if is_streaming: + gen = preserve_headers_context_async_generator(sse_generator(func(**kwargs))) + return StreamingResponse(gen, media_type="text/event-stream") + else: + value = func(**kwargs) + return await maybe_await(value) + except Exception as e: + logger.exception("Error executing endpoint %s", method, route) + raise translate_exception(e) from e sig = inspect.signature(func) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index ec68fb556..4acbe43f8 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -70,8 +70,9 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv pass def _get_api_key(self) -> str: - if self.config.api_key is not None: - return self.config.api_key.get_secret_value() + config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None + if config_api_key: + return config_api_key else: provider_data = self.get_request_provider_data() if provider_data is None or not provider_data.fireworks_api_key: diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 2046d4aae..dfc9ae6d3 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -93,8 +93,9 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi def _get_client(self) -> Together: together_api_key = None - if self.config.api_key is not None: - together_api_key = self.config.api_key.get_secret_value() + config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None + if config_api_key: + together_api_key = 
config_api_key else: provider_data = self.get_request_provider_data() if provider_data is None or not provider_data.together_api_key: diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py index 6a75b3adf..e410039e7 100644 --- a/tests/integration/fixtures/common.py +++ b/tests/integration/fixtures/common.py @@ -42,7 +42,7 @@ def provider_data(): for key, value in keymap.items(): if os.environ.get(key): provider_data[value] = os.environ[key] - return provider_data if len(provider_data) > 0 else None + return provider_data @pytest.fixture(scope="session") From ba917a9c485d84a61c1f7463e9653acde3fefddd Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sat, 8 Mar 2025 23:05:10 -0800 Subject: [PATCH 37/58] fix: make sure readthedocs is triggered if pyproject.toml is updated --- .github/workflows/update-readthedocs.yml | 2 ++ .pre-commit-config.yaml | 10 ---------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/update-readthedocs.yml b/.github/workflows/update-readthedocs.yml index 23bafa1e5..e8f14dbba 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows/update-readthedocs.yml @@ -12,12 +12,14 @@ on: - main paths: - 'docs/**' + - 'pyproject.toml' - '.github/workflows/update-readthedocs.yml' pull_request: branches: - main paths: - 'docs/**' + - 'pyproject.toml' - '.github/workflows/update-readthedocs.yml' jobs: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ff51a4795..926ae21cc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,10 +15,6 @@ repos: - id: end-of-file-fixer exclude: '^(.*\.svg)$' -# Temporarily disabling this -# - id: no-commit-to-branch -# args: ['--branch=main'] - - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.5.4 hooks: @@ -68,12 +64,6 @@ repos: - pydantic pass_filenames: false -# - repo: https://github.com/jsh9/pydoclint -# rev: d88180a8632bb1602a4d81344085cf320f288c5a -# hooks: -# - id: pydoclint -# args: [--config=pyproject.toml] - # - repo: https://github.com/tcort/markdown-link-check # rev: v3.11.2 # hooks: From 70ff226b6ae404d34d66c188c2b84bad9377010f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 9 Mar 2025 16:17:27 -0700 Subject: [PATCH 38/58] fix(library_client): ensure pending asyncio tasks like generator athrow are executed --- llama_stack/distribution/library_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index ab8ff60fa..5dc70bb67 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -163,6 +163,9 @@ class LlamaStackAsLibraryClient(LlamaStackClient): except StopAsyncIteration: pass finally: + pending = asyncio.all_tasks(loop) + if pending: + loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) loop.close() return sync_generator() @@ -383,7 +386,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): # Wrap the generator to preserve context across iterations wrapped_gen = preserve_headers_context_async_generator(gen()) - mock_response = httpx.Response( status_code=httpx.codes.OK, content=wrapped_gen, From a9c5d3cd3dd5f4db0d97da2b8b4552bac946999b Mon Sep 17 00:00:00 2001 From: Sarthak Deshpande <60317842+cheesecake100201@users.noreply.github.com> Date: Mon, 10 Mar 2025 05:29:24 +0530 Subject: [PATCH 39/58] chore: made inbuilt tools blocking calls into async non blocking calls (#1509) # What does this PR do? 
This PR converts blocking calls for built-in tools like Wolfram, Brave,
Tavily, and Bing into non-blocking async calls.

## Test Plan

pytest -s -v tool_runtime/test_builtin_tools.py --stack-config=together --text-model=meta-llama/Llama-3.1-8B-Instruct

Used the command above to verify the results for each tool.

---------

Co-authored-by: sarthakdeshpande
---
 .../providers/inline/vector_io/faiss/faiss.py |  3 ++-
 .../tool_runtime/bing_search/bing_search.py   | 17 +++++++++--------
 .../tool_runtime/brave_search/brave_search.py | 13 +++++++++----
 .../tavily_search/tavily_search.py            | 14 ++++++++------
 .../wolfram_alpha/wolfram_alpha.py            | 12 +++++-------
 .../utils/kvstore/mongodb/mongodb.py          | 19 ++++++++++++-------
 6 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 410d8bd8b..0c8718cb8 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

+import asyncio
import base64
import io
import json
@@ -99,7 +100,7 @@ class FaissIndex(EmbeddingIndex):
        await self._save_index()

    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
-        distances, indices = self.index.search(embedding.reshape(1, -1).astype(np.float32), k)
+        distances, indices = await asyncio.to_thread(self.index.search, embedding.reshape(1, -1).astype(np.float32), k)

        chunks = []
        scores = []
diff --git a/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py b/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
index 826d21dd9..f494a7fbb 100644
--- a/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
+++ b/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
@@ -7,7 +7,7 @@
import json
from typing import Any, Dict, List, Optional

-import requests
+import httpx

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.tools import (
@@ -31,7 +31,7 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
    async def initialize(self):
        pass

-    async def register_tool(self, tool: Tool):
+    async def register_tool(self, tool: Tool) -> None:
        pass

    async def unregister_tool(self, tool_id: str) -> None:
@@ -77,12 +77,13 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
            "q": kwargs["query"],
        }

-        response = requests.get(
-            url=self.url,
-            params=params,
-            headers=headers,
-        )
-        response.raise_for_status()
+        async with httpx.AsyncClient() as client:
+            response = await client.get(
+                url=self.url,
+                params=params,
+                headers=headers,
+            )
+            response.raise_for_status()

        return ToolInvocationResult(content=json.dumps(self._clean_response(response.json())))

diff --git a/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py b/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
index 8ef9f5705..78b47eb56 100644
--- a/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
+++ 
b/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional -import requests +import httpx from llama_stack.apis.common.content_types import URL from llama_stack.apis.tools import ( @@ -30,7 +30,7 @@ class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequest async def initialize(self): pass - async def register_tool(self, tool: Tool): + async def register_tool(self, tool: Tool) -> None: pass async def unregister_tool(self, tool_id: str) -> None: @@ -74,8 +74,13 @@ class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequest "Accept": "application/json", } payload = {"q": kwargs["query"]} - response = requests.get(url=url, params=payload, headers=headers) - response.raise_for_status() + async with httpx.AsyncClient() as client: + response = await client.get( + url=url, + params=payload, + headers=headers, + ) + response.raise_for_status() results = self._clean_brave_response(response.json()) content_items = "\n".join([str(result) for result in results]) return ToolInvocationResult( diff --git a/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py b/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py index 57749894a..5b23d94d3 100644 --- a/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +++ b/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py @@ -7,7 +7,7 @@ import json from typing import Any, Dict, List, Optional -import requests +import httpx from llama_stack.apis.common.content_types import URL from llama_stack.apis.tools import ( @@ -30,7 +30,7 @@ class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques async def initialize(self): pass - async def register_tool(self, tool: Tool): + async def register_tool(self, tool: Tool) -> None: pass async def unregister_tool(self, tool_id: str) -> None: @@ -66,10 +66,12 @@ class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult: api_key = self._get_api_key() - response = requests.post( - "https://api.tavily.com/search", - json={"api_key": api_key, "query": kwargs["query"]}, - ) + async with httpx.AsyncClient() as client: + response = await client.post( + "https://api.tavily.com/search", + json={"api_key": api_key, "query": kwargs["query"]}, + ) + response.raise_for_status() return ToolInvocationResult(content=json.dumps(self._clean_tavily_response(response.json()))) diff --git a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py index 08529384a..8489fa7d8 100644 --- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py @@ -7,7 +7,7 @@ import json from typing import Any, Dict, List, Optional -import requests +import httpx from llama_stack.apis.common.content_types import URL from llama_stack.apis.tools import ( @@ -31,7 +31,7 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques async def initialize(self): pass - async def register_tool(self, tool: Tool): + async def register_tool(self, tool: Tool) -> None: pass async def unregister_tool(self, tool_id: str) -> None: @@ -73,11 +73,9 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques "format": "plaintext", 
"output": "json", } - response = requests.get( - self.url, - params=params, - ) - + async with httpx.AsyncClient() as client: + response = await client.get(params=params, url=self.url) + response.raise_for_status() return ToolInvocationResult(content=json.dumps(self._clean_wolfram_alpha_response(response.json()))) def _clean_wolfram_alpha_response(self, wa_response): diff --git a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py index 965b4e213..c1581dc8d 100644 --- a/llama_stack/providers/utils/kvstore/mongodb/mongodb.py +++ b/llama_stack/providers/utils/kvstore/mongodb/mongodb.py @@ -8,9 +8,11 @@ import logging from datetime import datetime from typing import List, Optional -from pymongo import MongoClient +from pymongo import AsyncMongoClient -from llama_stack.providers.utils.kvstore import KVStore, MongoDBKVStoreConfig +from llama_stack.providers.utils.kvstore import KVStore + +from ..config import MongoDBKVStoreConfig log = logging.getLogger(__name__) @@ -30,7 +32,7 @@ class MongoDBKVStoreImpl(KVStore): "password": self.config.password, } conn_creds = {k: v for k, v in conn_creds.items() if v is not None} - self.conn = MongoClient(**conn_creds) + self.conn = AsyncMongoClient(**conn_creds) self.collection = self.conn[self.config.db][self.config.collection_name] except Exception as e: log.exception("Could not connect to MongoDB database server") @@ -44,17 +46,17 @@ class MongoDBKVStoreImpl(KVStore): async def set(self, key: str, value: str, expiration: Optional[datetime] = None) -> None: key = self._namespaced_key(key) update_query = {"$set": {"value": value, "expiration": expiration}} - self.collection.update_one({"key": key}, update_query, upsert=True) + await self.collection.update_one({"key": key}, update_query, upsert=True) async def get(self, key: str) -> Optional[str]: key = self._namespaced_key(key) query = {"key": key} - result = self.collection.find_one(query, {"value": 1, "_id": 0}) + result = await self.collection.find_one(query, {"value": 1, "_id": 0}) return result["value"] if result else None async def delete(self, key: str) -> None: key = self._namespaced_key(key) - self.collection.delete_one({"key": key}) + await self.collection.delete_one({"key": key}) async def range(self, start_key: str, end_key: str) -> List[str]: start_key = self._namespaced_key(start_key) @@ -63,4 +65,7 @@ class MongoDBKVStoreImpl(KVStore): "key": {"$gte": start_key, "$lt": end_key}, } cursor = self.collection.find(query, {"value": 1, "_id": 0}).sort("key", 1) - return [doc["value"] for doc in cursor] + result = [] + async for doc in cursor: + result.append(doc["value"]) + return result From d045b8830f7b3ee0d06b2c0697efe132d8973cf8 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Mon, 10 Mar 2025 10:42:05 -0700 Subject: [PATCH 40/58] docs: update prompt for websearch example (#1520) Summary: model is sometimes reluctant to use tools by default. Test Plan: run in notebook --- docs/getting_started.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 513335c52..01e63fc4f 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -1640,7 +1640,7 @@ "agent = Agent(\n", " client, \n", " model=model_id,\n", - " instructions=\"You are a helpful assistant\",\n", + " instructions=\"You are a helpful assistant. 
Use websearch tool to help answer questions.\",\n",
    "    tools=[\"builtin::websearch\"],\n",
    ")\n",
    "user_prompts = [\n",

From 8814111da12034f247561e2ffe793d6480e578d6 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Tue, 11 Mar 2025 02:38:07 +0800
Subject: [PATCH 41/58] docs: improve eval doc (#1501)

# What does this PR do?
Add the missing `LlamaStackClient` import and client initialization to the
evals doc example, and fix capitalization in one of its example prompts.

Signed-off-by: reidliu
Co-authored-by: reidliu
---
 docs/source/building_applications/evals.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/building_applications/evals.md b/docs/source/building_applications/evals.md
index fc1270bf6..211d3bc26 100644
--- a/docs/source/building_applications/evals.md
+++ b/docs/source/building_applications/evals.md
@@ -23,9 +23,12 @@ In this example, we will show you how to:
##### Building a Search Agent
```python
+from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger

+client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
+
agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
@@ -33,7 +36,7 @@ agent = Agent(
    tools=["builtin::websearch"],
)
user_prompts = [
-    "Which teams played in the NBA western conference finals of 2024. Search the web for the answer.",
+    "Which teams played in the NBA Western Conference Finals of 2024. Search the web for the answer.",
    "In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
    "What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
]

From 23278d1e5dfbe7930af2ec7969a20b813b60cf20 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 10 Mar 2025 13:03:57 -0700
Subject: [PATCH 42/58] fix: update getting_started structured decoding cell
 (#1523)

# What does this PR do?

- Together's inference only supports 3.1 for structured decoding

## Test Plan

```
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
```

---
 docs/getting_started.ipynb | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb
index 01e63fc4f..fd625a394 100644
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@@ -1267,7 +1267,6 @@
    }
   ],
   "source": [
-    "# NBVAL_SKIP\n",
    "from pydantic import BaseModel\n",
    "\n",
    "\n",
@@ -1279,7 +1278,7 @@
    "\n",
    "user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. 
\"\n", "response = client.inference.completion(\n", - " model_id=model_id,\n", + " model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n", " content=user_input,\n", " stream=False,\n", " sampling_params={\n", From 0b8cb830b9280796cb8a300e9298b096c9fcd6d7 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 11 Mar 2025 04:04:59 +0800 Subject: [PATCH 43/58] docs: update ollama doc url (#1508) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] It should changed in this pr https://github.com/meta-llama/llama-stack/pull/1190/files#diff-53e3f35ced54ee5e57dc8b0d3b04770ed84f2f6434c6f492f42569b3c2810ecd [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- docs/source/distributions/self_hosted_distro/ollama.md | 2 +- docs/zero_to_hero_guide/README.md | 2 +- llama_stack/templates/ollama/doc_template.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index fb3f9164a..a6390de34 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -130,7 +130,7 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration ```{note} -Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models. +Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models. ``` To serve a new model with `ollama` diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 98f40bc3c..2d94a7204 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -40,7 +40,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next ollama run llama3.2:3b-instruct-fp16 --keepalive -1m ``` **Note**: - - The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) + - The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) - `keepalive -1m` is used so that ollama continues to keep the model in memory indefinitely. Otherwise, ollama frees up memory and you would have to run `ollama run` again. --- diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index e5444d3da..8964260a6 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -119,7 +119,7 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration ```{note} -Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models. 
+Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models.
 ```

 To serve a new model with `ollama`

From 6dbac3beede0b961145166428b51bb8347b75e38 Mon Sep 17 00:00:00 2001
From: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com>
Date: Mon, 10 Mar 2025 16:27:33 -0400
Subject: [PATCH 44/58] chore: Display code coverage for unit tests in PR
 builds (#1512)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?
This PR allows unit test code coverage percentages to be reported in PR
builds. Today's output only tells the end user which tests passed and which
tests failed.

If a contributor is creating a new module within Llama Stack and starts
writing unit tests for that module, it might be difficult for Llama Stack
maintainers to immediately determine the code coverage percentage for that
new module.

To allow for code coverage reporting in the CI, we simply need to install
`pytest-cov` so we can use the `--cov` flag with the existing `pytest`
command.

Ideally, it would be nicer to have a bot report code coverage, but this PR
can be a temporary solution.

## Test Plan

I ran these changes locally and used a PR build to confirm the expected
behavior.

Signed-off-by: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com>
---
 .github/workflows/unit-tests.yml |  2 +-
 .gitignore                       |  1 +
 pyproject.toml                   |  1 +
 uv.lock                          | 30 +++++++++++++++++++++++++-----
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 28e749aff..dc17cbc51 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -23,7 +23,7 @@ jobs:

       - name: Run unit tests
         run: |
-          uv run -p 3.10.16 --with . --with ".[dev]" --with ".[test]" pytest -s -v tests/unit/ --junitxml=pytest-report.xml
+          uv run -p 3.10.16 --with . --with ".[dev]" --with ".[test]" pytest --cov=. 
-s -v tests/unit/ --junitxml=pytest-report.xml - name: Upload test results if: always() diff --git a/.gitignore b/.gitignore index 163b65947..1b15107f3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ docs/src pyrightconfig.json venv/ pytest-report.xml +.coverage diff --git a/pyproject.toml b/pyproject.toml index 077214354..f724b20ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ dev = [ "pytest", "pytest-asyncio", + "pytest-cov", "pytest-html", "nbval", # For notebook testing "black", diff --git a/uv.lock b/uv.lock index a5c26a303..d6bf6ce51 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,4 @@ version = 1 -revision = 1 requires-python = ">=3.10" resolution-markers = [ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -293,7 +292,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "(platform_machine != 'aarch64' and platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -381,6 +380,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/b2/f655700e1024dec98b10ebaafd0cedbc25e40e4abe62a3c8e2ceef4f8f0a/coverage-7.6.12-py3-none-any.whl", hash = "sha256:eb8668cfbc279a536c633137deeb9435d2962caec279c3f8cf8b91fff6ff8953", size = 200552 }, ] +[package.optional-dependencies] +toml = [ + { name = "tomli", marker = "python_full_version <= '3.11'" }, +] + [[package]] name = "debugpy" version = "1.8.12" @@ -679,7 +683,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "sys_platform == 'darwin'" }, + { name = "appnope", marker = "(platform_machine != 'aarch64' and platform_system == 'Darwin') or (platform_system == 'Darwin' and sys_platform != 'linux')" }, { name = "comm" }, { name = "debugpy" }, { name = "ipython" }, @@ -895,6 +899,7 @@ dev = [ { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "pytest-cov" }, { name = "pytest-html" }, { name = "ruamel-yaml" }, { name = "ruff" }, @@ -962,6 +967,7 @@ requires-dist = [ { name = "pypdf", marker = "extra == 'test'" }, { name = "pytest", marker = "extra == 'dev'" }, { name = "pytest-asyncio", marker = "extra == 'dev'" }, + { name = "pytest-cov", marker = "extra == 'dev'" }, { name = "pytest-html", marker = "extra == 'dev'" }, { name = "python-dotenv" }, { name = "requests" }, @@ -988,7 +994,6 @@ requires-dist = [ { name = "types-setuptools", marker = "extra == 'dev'" }, { name = "uvicorn", marker = "extra == 'dev'" }, ] -provides-extras = ["dev", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" @@ -1767,6 +1772,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/61/74/49f5d20c514ccc631b940cc9dfec45dcce418dc84a98463a2e2ebec33904/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e", size = 2257982 }, { url = 
"https://files.pythonhosted.org/packages/92/4b/d33ef74e2cc0025a259936661bb53432c5bbbadc561c5f2e023bcd73ce4c/pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e", size = 1779052 }, { url = "https://files.pythonhosted.org/packages/5b/be/7c991840af1184009fc86267160948350d1bf875f153c97bb471ad944e40/pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0", size = 1816307 }, + { url = "https://files.pythonhosted.org/packages/af/ac/24125ad36778914a36f08d61ba5338cb9159382c638d9761ee19c8de822c/pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8", size = 1694999 }, + { url = "https://files.pythonhosted.org/packages/93/73/be7a54a5903508070e5508925ba94493a1f326cfeecfff750e3eb250ea28/pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c", size = 1769437 }, { url = "https://files.pythonhosted.org/packages/e5/9f/39a6187f3986841fa6a9f35c6fdca5030ef73ff708b45a993813a51d7d10/pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31", size = 1619607 }, { url = "https://files.pythonhosted.org/packages/f8/70/60bb08e9e9841b18d4669fb69d84b64ce900aacd7eb0ebebd4c7b9bdecd3/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3", size = 1653571 }, { url = "https://files.pythonhosted.org/packages/c9/6f/191b73509291c5ff0dddec9cc54797b1d73303c12b2e4017b24678e57099/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37", size = 1691548 }, @@ -1912,6 +1919,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467 }, ] +[[package]] +name = "pytest-cov" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coverage", extra = ["toml"] }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/be/45/9b538de8cef30e17c7b45ef42f538a94889ed6a16f2387a6c89e73220651/pytest-cov-6.0.0.tar.gz", hash = "sha256:fde0b595ca248bb8e2d76f020b465f3b107c9632e6a1d1705f17834c89dcadc0", size = 66945 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/3b/48e79f2cd6a61dbbd4807b4ed46cb564b4fd50a76166b1c4ea5c1d9e2371/pytest_cov-6.0.0-py3-none-any.whl", hash = "sha256:eee6f1b9e61008bd34975a4d5bab25801eb31898b032dd55addc93e96fcaaa35", size = 22949 }, +] + [[package]] name = "pytest-html" version = "4.1.1" @@ -2893,7 +2913,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "colorama", marker = "(platform_machine != 'aarch64' and platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform != 'darwin' and sys_platform != 'linux')" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ From 735892cbd2244481569839001bed829975ec3489 Mon Sep 17 00:00:00 2001 From: James Kunstle <52969093+JamesKunstle@users.noreply.github.com> Date: Mon, 10 Mar 2025 14:12:53 -0700 Subject: [PATCH 45/58] refactor: `ImageType` to `LlamaStackImageType` (#1500) This disambiguates "Image" term from "container image" alternative usage and allows for: ```python if image_type == LlamaStackImagetype.venv: ... ``` accesses rather than `ImageType.venv.value` # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Changes enum use to comply with semantic python styling and naming conventions. ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] Refactor was automated and small so simple run-through of creating images was done. Signed-off-by: James Kunstle --- llama_stack/cli/stack/_build.py | 12 ++++++------ llama_stack/distribution/build.py | 8 ++++---- llama_stack/distribution/utils/exec.py | 6 +++--- llama_stack/distribution/utils/image_types.py | 10 +++++----- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py index 1b2470918..3887bf4f9 100644 --- a/llama_stack/cli/stack/_build.py +++ b/llama_stack/cli/stack/_build.py @@ -39,7 +39,7 @@ from llama_stack.distribution.resolver import InvalidProviderError from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty -from llama_stack.distribution.utils.image_types import ImageType +from llama_stack.distribution.utils.image_types import LlamaStackImageType from llama_stack.providers.datatypes import Api TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates" @@ -170,7 +170,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None: ) sys.exit(1) - if build_config.image_type == ImageType.container.value and not args.image_name: + if build_config.image_type == LlamaStackImageType.CONTAINER.value and not args.image_name: cprint( "Please specify --image-name when building a container from a config file", color="red", @@ -226,7 +226,7 @@ def _generate_run_config( """ apis = list(build_config.distribution_spec.providers.keys()) run_config = StackRunConfig( - container_image=(image_name if build_config.image_type == ImageType.container.value else None), + container_image=(image_name if build_config.image_type == LlamaStackImageType.CONTAINER.value else None), image_name=image_name, apis=apis, providers={}, @@ -279,16 +279,16 @@ def _run_stack_build_command_from_build_config( template_name: Optional[str] = None, config_path: Optional[str] = None, ) -> str: - if build_config.image_type == ImageType.container.value: + if build_config.image_type == LlamaStackImageType.CONTAINER.value: if template_name: image_name = f"distribution-{template_name}" else: if not image_name: raise ValueError("Please specify an image name when building a container image without a template") - elif build_config.image_type == ImageType.conda.value: + elif build_config.image_type == LlamaStackImageType.CONDA.value: if not image_name: 
raise ValueError("Please specify an image name when building a conda image") - elif build_config.image_type == ImageType.venv.value: + elif build_config.image_type == LlamaStackImageType.VENV.value: if not image_name and os.environ.get("UV_SYSTEM_PYTHON"): image_name = "__system__" if not image_name: diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 3d808a4a4..0e990d129 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -16,7 +16,7 @@ from termcolor import cprint from llama_stack.distribution.datatypes import BuildConfig, Provider from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.utils.exec import run_command, run_with_pty -from llama_stack.distribution.utils.image_types import ImageType +from llama_stack.distribution.utils.image_types import LlamaStackImageType from llama_stack.providers.datatypes import Api log = logging.getLogger(__name__) @@ -95,7 +95,7 @@ def build_image( normal_deps, special_deps = get_provider_dependencies(build_config.distribution_spec.providers) normal_deps += SERVER_DEPENDENCIES - if build_config.image_type == ImageType.container.value: + if build_config.image_type == LlamaStackImageType.CONTAINER.value: script = str(importlib.resources.files("llama_stack") / "distribution/build_container.sh") args = [ script, @@ -104,7 +104,7 @@ def build_image( container_base, " ".join(normal_deps), ] - elif build_config.image_type == ImageType.conda.value: + elif build_config.image_type == LlamaStackImageType.CONDA.value: script = str(importlib.resources.files("llama_stack") / "distribution/build_conda_env.sh") args = [ script, @@ -112,7 +112,7 @@ def build_image( str(build_file_path), " ".join(normal_deps), ] - elif build_config.image_type == ImageType.venv.value: + elif build_config.image_type == LlamaStackImageType.VENV.value: script = str(importlib.resources.files("llama_stack") / "distribution/build_venv.sh") args = [ script, diff --git a/llama_stack/distribution/utils/exec.py b/llama_stack/distribution/utils/exec.py index aae6b35d8..86613dc9c 100644 --- a/llama_stack/distribution/utils/exec.py +++ b/llama_stack/distribution/utils/exec.py @@ -20,14 +20,14 @@ import importlib import json from pathlib import Path -from llama_stack.distribution.utils.image_types import ImageType +from llama_stack.distribution.utils.image_types import LlamaStackImageType def formulate_run_args(image_type, image_name, config, template_name) -> list: env_name = "" - if image_type == ImageType.container.value or config.container_image: + if image_type == LlamaStackImageType.CONTAINER.value or config.container_image: env_name = f"distribution-{template_name}" if template_name else config.container_image - elif image_type == ImageType.conda.value: + elif image_type == LlamaStackImageType.CONDA.value: current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") env_name = image_name or current_conda_env if not env_name: diff --git a/llama_stack/distribution/utils/image_types.py b/llama_stack/distribution/utils/image_types.py index 1a43b092f..403c91ca6 100644 --- a/llama_stack/distribution/utils/image_types.py +++ b/llama_stack/distribution/utils/image_types.py @@ -4,10 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from enum import Enum +import enum -class ImageType(Enum): - container = "container" - conda = "conda" - venv = "venv" +class LlamaStackImageType(enum.Enum): + CONTAINER = "container" + CONDA = "conda" + VENV = "venv" From bc8daf7feabc653d18a300f41b55c0ac6c78b8f3 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 10 Mar 2025 14:59:11 -0700 Subject: [PATCH 46/58] fix: include jinja2 as a core llama-stack dependency (#1529) We removed `llama-models` as a dep which was pulling this in for us previously. This did not get caught in the release process because the distros we use for testing (fireworks / together) pull that in via sentence transformers which we don't use in all distros (notably ollama.) See #1511 ## Test Plan Ran `llama-stack-ops/actions/test-and-cut/main.sh` with `ONLY_TEST_DONT_CUT=1 COMMIT_ID=origin/fix_jinja2` and by making it build the ollama docker. Ran the docker to ensure it does not error out with jinja2 dependency error. (Unfortunately there is another error with sqlite_vec there.) --- pyproject.toml | 1 + requirements.txt | 2 ++ uv.lock | 10 +++++----- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f724b20ef..b2412bee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "fire", "httpx", "huggingface-hub", + "jinja2>=3.1.6", "jsonschema", "llama-stack-client>=0.1.6", "prompt-toolkit", diff --git a/requirements.txt b/requirements.txt index 066c9f790..ae8a0af9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,11 +18,13 @@ httpcore==1.0.7 httpx==0.28.1 huggingface-hub==0.29.0 idna==3.10 +jinja2==3.1.6 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 llama-stack-client==0.1.6 lxml==5.3.1 markdown-it-py==3.0.0 +markupsafe==3.0.2 mdurl==0.1.2 numpy==2.2.3 packaging==24.2 diff --git a/uv.lock b/uv.lock index d6bf6ce51..db48f9876 100644 --- a/uv.lock +++ b/uv.lock @@ -292,7 +292,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "(platform_machine != 'aarch64' and platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -683,7 +683,7 @@ name = "ipykernel" version = "6.29.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "appnope", marker = "(platform_machine != 'aarch64' and platform_system == 'Darwin') or (platform_system == 'Darwin' and sys_platform != 'linux')" }, + { name = "appnope", marker = "sys_platform == 'darwin'" }, { name = "comm" }, { name = "debugpy" }, { name = "ipython" }, @@ -873,6 +873,7 @@ dependencies = [ { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, + { name = "jinja2" }, { name = "jsonschema" }, { name = "llama-stack-client" }, { name = "pillow" }, @@ -949,6 +950,7 @@ requires-dist = [ { name = "groq", marker = "extra == 'test'" }, { name = "httpx" }, { name = "huggingface-hub" }, + { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, { name = "llama-stack-client", specifier = ">=0.1.6" }, @@ -1772,8 +1774,6 @@ 
wheels = [ { url = "https://files.pythonhosted.org/packages/61/74/49f5d20c514ccc631b940cc9dfec45dcce418dc84a98463a2e2ebec33904/pycryptodomex-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:52e23a0a6e61691134aa8c8beba89de420602541afaae70f66e16060fdcd677e", size = 2257982 }, { url = "https://files.pythonhosted.org/packages/92/4b/d33ef74e2cc0025a259936661bb53432c5bbbadc561c5f2e023bcd73ce4c/pycryptodomex-3.21.0-cp36-abi3-win32.whl", hash = "sha256:a3d77919e6ff56d89aada1bd009b727b874d464cb0e2e3f00a49f7d2e709d76e", size = 1779052 }, { url = "https://files.pythonhosted.org/packages/5b/be/7c991840af1184009fc86267160948350d1bf875f153c97bb471ad944e40/pycryptodomex-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b0e9765f93fe4890f39875e6c90c96cb341767833cfa767f41b490b506fa9ec0", size = 1816307 }, - { url = "https://files.pythonhosted.org/packages/af/ac/24125ad36778914a36f08d61ba5338cb9159382c638d9761ee19c8de822c/pycryptodomex-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:feaecdce4e5c0045e7a287de0c4351284391fe170729aa9182f6bd967631b3a8", size = 1694999 }, - { url = "https://files.pythonhosted.org/packages/93/73/be7a54a5903508070e5508925ba94493a1f326cfeecfff750e3eb250ea28/pycryptodomex-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:365aa5a66d52fd1f9e0530ea97f392c48c409c2f01ff8b9a39c73ed6f527d36c", size = 1769437 }, { url = "https://files.pythonhosted.org/packages/e5/9f/39a6187f3986841fa6a9f35c6fdca5030ef73ff708b45a993813a51d7d10/pycryptodomex-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3efddfc50ac0ca143364042324046800c126a1d63816d532f2e19e6f2d8c0c31", size = 1619607 }, { url = "https://files.pythonhosted.org/packages/f8/70/60bb08e9e9841b18d4669fb69d84b64ce900aacd7eb0ebebd4c7b9bdecd3/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df2608682db8279a9ebbaf05a72f62a321433522ed0e499bc486a6889b96bf3", size = 1653571 }, { url = "https://files.pythonhosted.org/packages/c9/6f/191b73509291c5ff0dddec9cc54797b1d73303c12b2e4017b24678e57099/pycryptodomex-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5823d03e904ea3e53aebd6799d6b8ec63b7675b5d2f4a4bd5e3adcb512d03b37", size = 1691548 }, @@ -2913,7 +2913,7 @@ name = "tqdm" version = "4.67.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "(platform_machine != 'aarch64' and platform_system == 'Windows' and sys_platform == 'linux') or (platform_system == 'Windows' and sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } wheels = [ From 921f8b1125b22ae856bf481a4ea2f1a09544ff8e Mon Sep 17 00:00:00 2001 From: Sarthak Deshpande <60317842+cheesecake100201@users.noreply.github.com> Date: Tue, 11 Mar 2025 03:55:01 +0530 Subject: [PATCH 47/58] chore: Together async client (#1510) # What does this PR do? Uses together async client instead of sync client [//]: # (If resolving an issue, uncomment and update the line below) ## Test Plan Command to run the test is in the image below(2 tests fail, and they were failing for the old stable version as well with the same errors.) 
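For reference, a minimal sketch of the sync-to-async pattern this PR adopts, assuming the `together` SDK's `AsyncTogether` client mirrors the sync client's `completions.create` surface (the model id, prompt, and chunk field access below are illustrative assumptions, not part of this diff):

```python
import asyncio

from together import AsyncTogether


async def main() -> None:
    # AsyncTogether picks up TOGETHER_API_KEY from the environment by default.
    client = AsyncTogether()

    # Non-streaming: the old blocking call becomes a single awaited coroutine.
    r = await client.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",  # illustrative model id
        prompt="Say hello.",
    )
    print(r.choices[0].text)

    # Streaming: with stream=True the awaited call returns an async iterator,
    # so the manual sync-to-async generator wrapper is no longer needed.
    stream = await client.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",  # illustrative model id
        prompt="Say hello.",
        stream=True,
    )
    async for chunk in stream:
        print(chunk.choices[0].text or "", end="")  # assumes completion chunks carry `.text`


asyncio.run(main())
```

Note that the adapter below also caches the client on `self._client` and closes it in `shutdown()`, rather than constructing a new client on every request as the sync version did.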
image [//]: # (## Documentation) --------- Co-authored-by: sarthakdeshpande --- .../remote/inference/together/together.py | 71 +++++++++---------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index dfc9ae6d3..a4e02f2cb 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -6,7 +6,7 @@ from typing import AsyncGenerator, List, Optional, Union -from together import Together +from together import AsyncTogether from llama_stack.apis.common.content_types import ( InterleavedContent, @@ -59,12 +59,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi def __init__(self, config: TogetherImplConfig) -> None: ModelRegistryHelper.__init__(self, MODEL_ENTRIES) self.config = config + self._client = None async def initialize(self) -> None: pass async def shutdown(self) -> None: - pass + if self._client: + await self._client.close() + self._client = None async def completion( self, @@ -91,35 +94,32 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi else: return await self._nonstream_completion(request) - def _get_client(self) -> Together: - together_api_key = None - config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None - if config_api_key: - together_api_key = config_api_key - else: - provider_data = self.get_request_provider_data() - if provider_data is None or not provider_data.together_api_key: - raise ValueError( - 'Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": }' - ) - together_api_key = provider_data.together_api_key - return Together(api_key=together_api_key) + def _get_client(self) -> AsyncTogether: + if not self._client: + together_api_key = None + config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None + if config_api_key: + together_api_key = config_api_key + else: + provider_data = self.get_request_provider_data() + if provider_data is None or not provider_data.together_api_key: + raise ValueError( + 'Pass Together API Key in the header X-LlamaStack-Provider-Data as { "together_api_key": }' + ) + together_api_key = provider_data.together_api_key + self._client = AsyncTogether(api_key=together_api_key) + return self._client async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse: params = await self._get_params(request) - r = self._get_client().completions.create(**params) + client = self._get_client() + r = await client.completions.create(**params) return process_completion_response(r) async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator: params = await self._get_params(request) - - # if we shift to TogetherAsyncClient, we won't need this wrapper - async def _to_async_generator(): - s = self._get_client().completions.create(**params) - for chunk in s: - yield chunk - - stream = _to_async_generator() + client = await self._get_client() + stream = await client.completions.create(**params) async for chunk in process_completion_stream_response(stream): yield chunk @@ -184,25 +184,21 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse: params = await self._get_params(request) + client = self._get_client() if 
"messages" in params: - r = self._get_client().chat.completions.create(**params) + r = await client.chat.completions.create(**params) else: - r = self._get_client().completions.create(**params) + r = await client.completions.create(**params) return process_chat_completion_response(r, request) async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator: params = await self._get_params(request) + client = self._get_client() + if "messages" in params: + stream = await client.chat.completions.create(**params) + else: + stream = await client.completions.create(**params) - # if we shift to TogetherAsyncClient, we won't need this wrapper - async def _to_async_generator(): - if "messages" in params: - s = self._get_client().chat.completions.create(**params) - else: - s = self._get_client().completions.create(**params) - for chunk in s: - yield chunk - - stream = _to_async_generator() async for chunk in process_chat_completion_stream_response(stream, request): yield chunk @@ -240,7 +236,8 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi assert all(not content_has_media(content) for content in contents), ( "Together does not support media for embeddings" ) - r = self._get_client().embeddings.create( + client = self._get_client() + r = await client.embeddings.create( model=model.provider_resource_id, input=[interleaved_content_as_str(content) for content in contents], ) From 0e3c0cf8ded6769a4d10a71befaa9f4689af5ac0 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Mon, 10 Mar 2025 15:25:23 -0700 Subject: [PATCH 48/58] fix: server logging (#1521) Summary: Test Plan: ERROR 2025-03-10 10:53:00,804 __main__:239 server: Error executing endpoint route='/v1/inference/chat-completion' method='post' --- llama_stack/distribution/server/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 347d88a2c..f819d446f 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -236,7 +236,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str): value = func(**kwargs) return await maybe_await(value) except Exception as e: - logger.exception("Error executing endpoint %s", method, route) + logger.exception(f"Error executing endpoint {route=} {method=}") raise translate_exception(e) from e sig = inspect.signature(func) From a64021bb4789e20bd7ed4322bfa4fe0b583724a5 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Mon, 10 Mar 2025 18:29:08 -0400 Subject: [PATCH 49/58] fix: Disable async loop warning messages during test run (#1526) # What does this PR do? The test class by default enables debug mode, which produces some unexpected warnings like: ``` tests/unit/models/test_prompt_adapter.py::PrepareMessagesTests::test_completion_message_encoding WARNING 2025-03-10 20:41:48,577 asyncio:1904 uncategorized: Executing wait_for= created at /home/ec2-user/.local/share/uv/python/cpython-3.10.16-linux-x86_64-gnu/lib/python3.10/unittest/async_case.py:11 7> took 0.231 seconds PASSED ``` I suggest we disable these since they are not very useful and can confuse other developers. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Run tests. The warnings are no longer seen. 
[//]: # (## Documentation)

Signed-off-by: Ihar Hrachyshka
---
 tests/unit/models/test_prompt_adapter.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/unit/models/test_prompt_adapter.py b/tests/unit/models/test_prompt_adapter.py
index 2a6dbb561..c3755e2cb 100644
--- a/tests/unit/models/test_prompt_adapter.py
+++ b/tests/unit/models/test_prompt_adapter.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import unittest

 from llama_stack.apis.inference import (
@@ -31,6 +32,9 @@ MODEL3_2 = "Llama3.2-3B-Instruct"


 class PrepareMessagesTests(unittest.IsolatedAsyncioTestCase):
+    async def asyncSetUp(self):
+        asyncio.get_running_loop().set_debug(False)
+
     async def test_system_default(self):
         content = "Hello !"
         request = ChatCompletionRequest(

From 7559b4055ed221e2a1e0130369bec5a70d5cad43 Mon Sep 17 00:00:00 2001
From: Charlie Doern
Date: Mon, 10 Mar 2025 18:29:40 -0400
Subject: [PATCH 50/58] chore: add color to Env Variable message (#1525)

# What does this PR do?

Currently the `"Environment variable LLAMA_STACK_LOGGING found"` message is printed with no color. Switch to `cprint` and highlight it in yellow for visibility.

Signed-off-by: Charlie Doern
---
 llama_stack/log.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llama_stack/log.py b/llama_stack/log.py
index 175427f5c..9b9f5c5d8 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -12,6 +12,7 @@ from typing import Dict
 from rich.console import Console
 from rich.errors import MarkupError
 from rich.logging import RichHandler
+from termcolor import cprint

 # Default log level
 DEFAULT_LOG_LEVEL = logging.INFO
@@ -176,7 +177,7 @@ def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdap

     env_config = os.environ.get("LLAMA_STACK_LOGGING", "")
     if env_config:
-        print(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}")
+        cprint(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}", "yellow")
         _category_levels.update(parse_environment_config(env_config))

     setup_logging(_category_levels)

From 201a7567efab7993b17171720a73c2b274fe0dd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Mon, 10 Mar 2025 23:36:18 +0100
Subject: [PATCH 51/58] test: add inspect unit test (#1417)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

Add unit tests for the inspect endpoint.

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

$ ollama run llama3.2:3b-instruct-fp16 --keepalive=60m &
$ LLAMA_STACK_CONFIG=./llama_stack/templates/ollama/run.yaml uv run pytest -v -s tests/integration/inspect/test_inspect.py

/Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.10/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future.
Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================== test session starts ============================================== platform darwin -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.10.16', 'Platform': 'macOS-15.3.1-arm64-arm-64bit', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=strict, asyncio_default_fixture_loop_scope=None collected 2 items tests/integration/inspect/test_inspect.py::TestInspect::test_health[txt=8B] PASSED tests/integration/inspect/test_inspect.py::TestInspect::test_version[txt=8B] PASSED ========================================= 2 passed, 3 warnings in 2.26s =================================== ``` Signed-off-by: Sébastien Han --- tests/integration/inspect/__init__.py | 5 +++++ tests/integration/inspect/test_inspect.py | 24 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/integration/inspect/__init__.py create mode 100644 tests/integration/inspect/test_inspect.py diff --git a/tests/integration/inspect/__init__.py b/tests/integration/inspect/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/integration/inspect/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/integration/inspect/test_inspect.py b/tests/integration/inspect/test_inspect.py new file mode 100644 index 000000000..da704178d --- /dev/null +++ b/tests/integration/inspect/test_inspect.py @@ -0,0 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import pytest +from llama_stack_client import LlamaStackClient + +from llama_stack import LlamaStackAsLibraryClient + + +class TestInspect: + @pytest.mark.asyncio + def test_health(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): + health = llama_stack_client.inspect.health() + assert health is not None + assert health.status == "OK" + + @pytest.mark.asyncio + def test_version(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): + version = llama_stack_client.inspect.version() + assert version is not None + assert version.version is not None From 91b1b92908a2aa330aa9feda956c53be4e294e80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Mon, 10 Mar 2025 23:43:16 +0100 Subject: [PATCH 52/58] build: revamp "test" dependencies from pyproject (#1468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The `test` section has been updated to include only the essential dependencies needed for running integration tests, which are shared across all providers. If a provider requires additional dependencies, please add them to your environment separately. 
When using uv to run your tests, you can specify extra dependencies with the `--with` flag. Signed-off-by: Sébastien Han --- .github/workflows/unit-tests.yml | 2 +- pyproject.toml | 14 +- uv.lock | 938 ++++++++++++++++++++++++++++--- 3 files changed, 873 insertions(+), 81 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index dc17cbc51..075aa8527 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -23,7 +23,7 @@ jobs: - name: Run unit tests run: | - uv run -p 3.10.16 --with . --with ".[dev]" --with ".[test]" pytest --cov=. -s -v tests/unit/ --junitxml=pytest-report.xml + uv run -p 3.10.16 --with . --with ".[dev]" --with ".[unit]" pytest --cov=. -s -v tests/unit/ --junitxml=pytest-report.xml - name: Upload test results if: always() diff --git a/pyproject.toml b/pyproject.toml index b2412bee9..b3ebc45dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,20 +55,24 @@ dev = [ "fastapi", "ruamel.yaml", # needed for openapi generator ] +# These are the dependencies required for running unit tests. +unit = ["sqlite-vec", "openai", "aiosqlite", "pypdf", "chardet"] +# These are the core dependencies required for running integration tests. They are shared across all +# providers. If a provider requires additional dependencies, please add them to your environment +# separately. If you are using "uv" to execute your tests, you can use the "--with" flag to specify extra +# dependencies. test = [ "openai", "aiosqlite", - "sqlite-vec", - "ollama", "torch>=2.6.0", - "fairscale>=0.4.13", "torchvision>=0.21.0", - "lm-format-enforcer>=0.10.9", - "groq", "opentelemetry-sdk", "opentelemetry-exporter-otlp-proto-http", "chardet", "pypdf", + "mcp", + "datasets", + "autoevals", ] docs = [ "sphinx-autobuild", diff --git a/uv.lock b/uv.lock index db48f9876..9ec3680f8 100644 --- a/uv.lock +++ b/uv.lock @@ -1,4 +1,5 @@ version = 1 +revision = 1 requires-python = ">=3.10" resolution-markers = [ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -12,6 +13,109 @@ resolution-markers = [ "python_full_version >= '3.12' and sys_platform == 'darwin'", ] +[[package]] +name = "aiohappyeyeballs" +version = "2.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/0c/458958007041f4b4de2d307e6b75d9e7554dad0baf26fe7a48b741aac126/aiohappyeyeballs-2.5.0.tar.gz", hash = "sha256:18fde6204a76deeabc97c48bdd01d5801cfda5d6b9c8bbeb1aaaee9d648ca191", size = 22494 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/9a/e4886864ce06e1579bd428208127fbdc0d62049c751e4e9e3b509c0059dc/aiohappyeyeballs-2.5.0-py3-none-any.whl", hash = "sha256:0850b580748c7071db98bffff6d4c94028d0d3035acc20fd721a0ce7e8cac35d", size = 15128 }, +] + +[[package]] +name = "aiohttp" +version = "3.11.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "async-timeout", marker = "python_full_version < '3.11'" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/3f/c4a667d184c69667b8f16e0704127efc5f1e60577df429382b4d95fd381e/aiohttp-3.11.13.tar.gz", hash = "sha256:8ce789231404ca8fff7f693cdce398abf6d90fd5dae2b1847477196c243b1fbb", size = 7674284 } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/f2/49/18bde4fbe1f98a12fb548741e65b27c5f0991c1af4ad15c86b537a4ce94a/aiohttp-3.11.13-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a4fe27dbbeec445e6e1291e61d61eb212ee9fed6e47998b27de71d70d3e8777d", size = 708941 }, + { url = "https://files.pythonhosted.org/packages/99/24/417e5ab7074f5c97c9a794b6acdc59f47f2231d43e4d5cec06150035e61e/aiohttp-3.11.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e64ca2dbea28807f8484c13f684a2f761e69ba2640ec49dacd342763cc265ef", size = 468823 }, + { url = "https://files.pythonhosted.org/packages/76/93/159d3a2561bc6d64d32f779d08b17570b1c5fe55b985da7e2df9b3a4ff8f/aiohttp-3.11.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9840be675de208d1f68f84d578eaa4d1a36eee70b16ae31ab933520c49ba1325", size = 455984 }, + { url = "https://files.pythonhosted.org/packages/18/bc/ed0dce45da90d4618ae14e677abbd704aec02e0f54820ea3815c156f0759/aiohttp-3.11.13-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28a772757c9067e2aee8a6b2b425d0efaa628c264d6416d283694c3d86da7689", size = 1585022 }, + { url = "https://files.pythonhosted.org/packages/75/10/c1e6d59030fcf04ccc253193607b5b7ced0caffd840353e109c51134e5e9/aiohttp-3.11.13-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b88aca5adbf4625e11118df45acac29616b425833c3be7a05ef63a6a4017bfdb", size = 1632761 }, + { url = "https://files.pythonhosted.org/packages/2d/8e/da1a20fbd2c961f824dc8efeb8d31c32ed4af761c87de83032ad4c4f5237/aiohttp-3.11.13-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce10ddfbe26ed5856d6902162f71b8fe08545380570a885b4ab56aecfdcb07f4", size = 1668720 }, + { url = "https://files.pythonhosted.org/packages/fa/9e/d0bbdc82236c3fe43b28b3338a13ef9b697b0f7a875b33b950b975cab1f6/aiohttp-3.11.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa48dac27f41b36735c807d1ab093a8386701bbf00eb6b89a0f69d9fa26b3671", size = 1589941 }, + { url = "https://files.pythonhosted.org/packages/ed/14/248ed0385baeee854e495ca7f33b48bb151d1b226ddbf1585bdeb2301fbf/aiohttp-3.11.13-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89ce611b1eac93ce2ade68f1470889e0173d606de20c85a012bfa24be96cf867", size = 1544978 }, + { url = "https://files.pythonhosted.org/packages/20/b0/b2ad9d24fe85db8330034ac45dde67799af40ca2363c0c9b30126e204ef3/aiohttp-3.11.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:78e4dd9c34ec7b8b121854eb5342bac8b02aa03075ae8618b6210a06bbb8a115", size = 1529641 }, + { url = "https://files.pythonhosted.org/packages/11/c6/03bdcb73a67a380b9593d52613ea88edd21ddc4ff5aaf06d4f807dfa2220/aiohttp-3.11.13-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:66047eacbc73e6fe2462b77ce39fc170ab51235caf331e735eae91c95e6a11e4", size = 1558027 }, + { url = "https://files.pythonhosted.org/packages/0d/ae/e45491c8ca4d1e30ff031fb25b44842e16c326f8467026c3eb2a9c167608/aiohttp-3.11.13-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ad8f1c19fe277eeb8bc45741c6d60ddd11d705c12a4d8ee17546acff98e0802", size = 1536991 }, + { url = "https://files.pythonhosted.org/packages/19/89/10eb37351dd2b52928a54768a70a58171e43d7914685fe3feec8f681d905/aiohttp-3.11.13-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:64815c6f02e8506b10113ddbc6b196f58dbef135751cc7c32136df27b736db09", size = 1607848 }, + { url = 
"https://files.pythonhosted.org/packages/a4/fd/492dec170df6ea57bef4bcd26374befdc170b10ba9ac7f51a0214943c20a/aiohttp-3.11.13-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:967b93f21b426f23ca37329230d5bd122f25516ae2f24a9cea95a30023ff8283", size = 1629208 }, + { url = "https://files.pythonhosted.org/packages/70/46/ef8a02cb171d4779ca1632bc8ac0c5bb89729b091e2a3f4b895d688146b5/aiohttp-3.11.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cf1f31f83d16ec344136359001c5e871915c6ab685a3d8dee38e2961b4c81730", size = 1564684 }, + { url = "https://files.pythonhosted.org/packages/8a/03/b1b552d1112b72da94bd1f9f5efb8adbcbbafaa8d495fc0924cd80493f17/aiohttp-3.11.13-cp310-cp310-win32.whl", hash = "sha256:00c8ac69e259c60976aa2edae3f13d9991cf079aaa4d3cd5a49168ae3748dee3", size = 416982 }, + { url = "https://files.pythonhosted.org/packages/b0/2d/b6be8e7905ceba64121268ce28208bafe508a742c1467bf636a41d152284/aiohttp-3.11.13-cp310-cp310-win_amd64.whl", hash = "sha256:90d571c98d19a8b6e793b34aa4df4cee1e8fe2862d65cc49185a3a3d0a1a3996", size = 442389 }, + { url = "https://files.pythonhosted.org/packages/3b/93/8e012ae31ff1bda5d43565d6f9e0bad325ba6f3f2d78f298bd39645be8a3/aiohttp-3.11.13-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6b35aab22419ba45f8fc290d0010898de7a6ad131e468ffa3922b1b0b24e9d2e", size = 709013 }, + { url = "https://files.pythonhosted.org/packages/d8/be/fc7c436678ffe547d038319add8e44fd5e33090158752e5c480aed51a8d0/aiohttp-3.11.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f81cba651db8795f688c589dd11a4fbb834f2e59bbf9bb50908be36e416dc760", size = 468896 }, + { url = "https://files.pythonhosted.org/packages/d9/1c/56906111ac9d4dab4baab43c89d35d5de1dbb38085150257895005b08bef/aiohttp-3.11.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f55d0f242c2d1fcdf802c8fabcff25a9d85550a4cf3a9cf5f2a6b5742c992839", size = 455968 }, + { url = "https://files.pythonhosted.org/packages/ba/16/229d36ed27c2bb350320364efb56f906af194616cc15fc5d87f3ef21dbef/aiohttp-3.11.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4bea08a6aad9195ac9b1be6b0c7e8a702a9cec57ce6b713698b4a5afa9c2e33", size = 1686082 }, + { url = "https://files.pythonhosted.org/packages/3a/44/78fd174509c56028672e5dfef886569cfa1fced0c5fd5c4480426db19ac9/aiohttp-3.11.13-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6070bcf2173a7146bb9e4735b3c62b2accba459a6eae44deea0eb23e0035a23", size = 1744056 }, + { url = "https://files.pythonhosted.org/packages/a3/11/325145c6dce8124b5caadbf763e908f2779c14bb0bc5868744d1e5cb9cb7/aiohttp-3.11.13-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:718d5deb678bc4b9d575bfe83a59270861417da071ab44542d0fcb6faa686636", size = 1785810 }, + { url = "https://files.pythonhosted.org/packages/95/de/faba18a0af09969e10eb89fdbd4cb968bea95e75449a7fa944d4de7d1d2f/aiohttp-3.11.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f6b2c5b4a4d22b8fb2c92ac98e0747f5f195e8e9448bfb7404cd77e7bfa243f", size = 1675540 }, + { url = "https://files.pythonhosted.org/packages/ea/53/0437c46e960b79ae3b1ff74c1ec12f04bf4f425bd349c8807acb38aae3d7/aiohttp-3.11.13-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:747ec46290107a490d21fe1ff4183bef8022b848cf9516970cb31de6d9460088", size = 1620210 }, + { url = "https://files.pythonhosted.org/packages/04/2f/31769ed8e29cc22baaa4005bd2749a7fd0f61ad0f86024d38dff8e394cf6/aiohttp-3.11.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:01816f07c9cc9d80f858615b1365f8319d6a5fd079cd668cc58e15aafbc76a54", size = 1654399 }, + { url = "https://files.pythonhosted.org/packages/b0/24/acb24571815b9a86a8261577c920fd84f819178c02a75b05b1a0d7ab83fb/aiohttp-3.11.13-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a08ad95fcbd595803e0c4280671d808eb170a64ca3f2980dd38e7a72ed8d1fea", size = 1660424 }, + { url = "https://files.pythonhosted.org/packages/91/45/30ca0c3ba5bbf7592eee7489eae30437736f7ff912eaa04cfdcf74edca8c/aiohttp-3.11.13-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c97be90d70f7db3aa041d720bfb95f4869d6063fcdf2bb8333764d97e319b7d0", size = 1650415 }, + { url = "https://files.pythonhosted.org/packages/86/8d/4d887df5e732cc70349243c2c9784911979e7bd71c06f9e7717b8a896f75/aiohttp-3.11.13-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ab915a57c65f7a29353c8014ac4be685c8e4a19e792a79fe133a8e101111438e", size = 1733292 }, + { url = "https://files.pythonhosted.org/packages/40/c9/bd950dac0a4c84d44d8da8d6e0f9c9511d45e02cf908a4e1fca591f46a25/aiohttp-3.11.13-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:35cda4e07f5e058a723436c4d2b7ba2124ab4e0aa49e6325aed5896507a8a42e", size = 1755536 }, + { url = "https://files.pythonhosted.org/packages/32/04/aafeda6b4ed3693a44bb89eae002ebaa74f88b2265a7e68f8a31c33330f5/aiohttp-3.11.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:af55314407714fe77a68a9ccaab90fdb5deb57342585fd4a3a8102b6d4370080", size = 1693126 }, + { url = "https://files.pythonhosted.org/packages/a1/4f/67729187e884b0f002a0317d2cc7962a5a0416cadc95ea88ba92477290d9/aiohttp-3.11.13-cp311-cp311-win32.whl", hash = "sha256:42d689a5c0a0c357018993e471893e939f555e302313d5c61dfc566c2cad6185", size = 416800 }, + { url = "https://files.pythonhosted.org/packages/29/23/d98d491ca073ee92cc6a741be97b6b097fb06dacc5f95c0c9350787db549/aiohttp-3.11.13-cp311-cp311-win_amd64.whl", hash = "sha256:b73a2b139782a07658fbf170fe4bcdf70fc597fae5ffe75e5b67674c27434a9f", size = 442891 }, + { url = "https://files.pythonhosted.org/packages/9a/a9/6657664a55f78db8767e396cc9723782ed3311eb57704b0a5dacfa731916/aiohttp-3.11.13-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2eabb269dc3852537d57589b36d7f7362e57d1ece308842ef44d9830d2dc3c90", size = 705054 }, + { url = "https://files.pythonhosted.org/packages/3b/06/f7df1fe062d16422f70af5065b76264f40b382605cf7477fa70553a9c9c1/aiohttp-3.11.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b77ee42addbb1c36d35aca55e8cc6d0958f8419e458bb70888d8c69a4ca833d", size = 464440 }, + { url = "https://files.pythonhosted.org/packages/22/3a/8773ea866735754004d9f79e501fe988bdd56cfac7fdecbc8de17fc093eb/aiohttp-3.11.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55789e93c5ed71832e7fac868167276beadf9877b85697020c46e9a75471f55f", size = 456394 }, + { url = "https://files.pythonhosted.org/packages/7f/61/8e2f2af2327e8e475a2b0890f15ef0bbfd117e321cce1e1ed210df81bbac/aiohttp-3.11.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c929f9a7249a11e4aa5c157091cfad7f49cc6b13f4eecf9b747104befd9f56f2", size = 1682752 }, + { url = "https://files.pythonhosted.org/packages/24/ed/84fce816bc8da39aa3f6c1196fe26e47065fea882b1a67a808282029c079/aiohttp-3.11.13-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d33851d85537bbf0f6291ddc97926a754c8f041af759e0aa0230fe939168852b", size = 1737375 }, + { url = 
"https://files.pythonhosted.org/packages/d9/de/35a5ba9e3d21ebfda1ebbe66f6cc5cbb4d3ff9bd6a03e5e8a788954f8f27/aiohttp-3.11.13-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9229d8613bd8401182868fe95688f7581673e1c18ff78855671a4b8284f47bcb", size = 1793660 }, + { url = "https://files.pythonhosted.org/packages/ff/fe/0f650a8c7c72c8a07edf8ab164786f936668acd71786dd5885fc4b1ca563/aiohttp-3.11.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669dd33f028e54fe4c96576f406ebb242ba534dd3a981ce009961bf49960f117", size = 1692233 }, + { url = "https://files.pythonhosted.org/packages/a8/20/185378b3483f968c6303aafe1e33b0da0d902db40731b2b2b2680a631131/aiohttp-3.11.13-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c1b20a1ace54af7db1f95af85da530fe97407d9063b7aaf9ce6a32f44730778", size = 1619708 }, + { url = "https://files.pythonhosted.org/packages/a4/f9/d9c181750980b17e1e13e522d7e82a8d08d3d28a2249f99207ef5d8d738f/aiohttp-3.11.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5724cc77f4e648362ebbb49bdecb9e2b86d9b172c68a295263fa072e679ee69d", size = 1641802 }, + { url = "https://files.pythonhosted.org/packages/50/c7/1cb46b72b1788710343b6e59eaab9642bd2422f2d87ede18b1996e0aed8f/aiohttp-3.11.13-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:aa36c35e94ecdb478246dd60db12aba57cfcd0abcad43c927a8876f25734d496", size = 1684678 }, + { url = "https://files.pythonhosted.org/packages/71/87/89b979391de840c5d7c34e78e1148cc731b8aafa84b6a51d02f44b4c66e2/aiohttp-3.11.13-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9b5b37c863ad5b0892cc7a4ceb1e435e5e6acd3f2f8d3e11fa56f08d3c67b820", size = 1646921 }, + { url = "https://files.pythonhosted.org/packages/a7/db/a463700ac85b72f8cf68093e988538faaf4e865e3150aa165cf80ee29d6e/aiohttp-3.11.13-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e06cf4852ce8c4442a59bae5a3ea01162b8fcb49ab438d8548b8dc79375dad8a", size = 1702493 }, + { url = "https://files.pythonhosted.org/packages/b8/32/1084e65da3adfb08c7e1b3e94f3e4ded8bd707dee265a412bc377b7cd000/aiohttp-3.11.13-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5194143927e494616e335d074e77a5dac7cd353a04755330c9adc984ac5a628e", size = 1735004 }, + { url = "https://files.pythonhosted.org/packages/a0/bb/a634cbdd97ce5d05c2054a9a35bfc32792d7e4f69d600ad7e820571d095b/aiohttp-3.11.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:afcb6b275c2d2ba5d8418bf30a9654fa978b4f819c2e8db6311b3525c86fe637", size = 1694964 }, + { url = "https://files.pythonhosted.org/packages/fd/cf/7d29db4e5c28ec316e5d2ac9ac9df0e2e278e9ea910e5c4205b9b64c2c42/aiohttp-3.11.13-cp312-cp312-win32.whl", hash = "sha256:7104d5b3943c6351d1ad7027d90bdd0ea002903e9f610735ac99df3b81f102ee", size = 411746 }, + { url = "https://files.pythonhosted.org/packages/65/a9/13e69ad4fd62104ebd94617f9f2be58231b50bb1e6bac114f024303ac23b/aiohttp-3.11.13-cp312-cp312-win_amd64.whl", hash = "sha256:47dc018b1b220c48089b5b9382fbab94db35bef2fa192995be22cbad3c5730c8", size = 438078 }, + { url = "https://files.pythonhosted.org/packages/87/dc/7d58d33cec693f1ddf407d4ab975445f5cb507af95600f137b81683a18d8/aiohttp-3.11.13-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9862d077b9ffa015dbe3ce6c081bdf35135948cb89116e26667dd183550833d1", size = 698372 }, + { url = "https://files.pythonhosted.org/packages/84/e7/5d88514c9e24fbc8dd6117350a8ec4a9314f4adae6e89fe32e3e639b0c37/aiohttp-3.11.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:fbfef0666ae9e07abfa2c54c212ac18a1f63e13e0760a769f70b5717742f3ece", size = 461057 }, + { url = "https://files.pythonhosted.org/packages/96/1a/8143c48a929fa00c6324f85660cb0f47a55ed9385f0c1b72d4b8043acf8e/aiohttp-3.11.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:93a1f7d857c4fcf7cabb1178058182c789b30d85de379e04f64c15b7e88d66fb", size = 453340 }, + { url = "https://files.pythonhosted.org/packages/2f/1c/b8010e4d65c5860d62681088e5376f3c0a940c5e3ca8989cae36ce8c3ea8/aiohttp-3.11.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba40b7ae0f81c7029583a338853f6607b6d83a341a3dcde8bed1ea58a3af1df9", size = 1665561 }, + { url = "https://files.pythonhosted.org/packages/19/ed/a68c3ab2f92fdc17dfc2096117d1cfaa7f7bdded2a57bacbf767b104165b/aiohttp-3.11.13-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b5b95787335c483cd5f29577f42bbe027a412c5431f2f80a749c80d040f7ca9f", size = 1718335 }, + { url = "https://files.pythonhosted.org/packages/27/4f/3a0b6160ce663b8ebdb65d1eedff60900cd7108838c914d25952fe2b909f/aiohttp-3.11.13-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7d474c5c1f0b9405c1565fafdc4429fa7d986ccbec7ce55bc6a330f36409cad", size = 1775522 }, + { url = "https://files.pythonhosted.org/packages/0b/58/9da09291e19696c452e7224c1ce8c6d23a291fe8cd5c6b247b51bcda07db/aiohttp-3.11.13-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e83fb1991e9d8982b3b36aea1e7ad27ea0ce18c14d054c7a404d68b0319eebb", size = 1677566 }, + { url = "https://files.pythonhosted.org/packages/3d/18/6184f2bf8bbe397acbbbaa449937d61c20a6b85765f48e5eddc6d84957fe/aiohttp-3.11.13-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4586a68730bd2f2b04a83e83f79d271d8ed13763f64b75920f18a3a677b9a7f0", size = 1603590 }, + { url = "https://files.pythonhosted.org/packages/04/94/91e0d1ca0793012ccd927e835540aa38cca98bdce2389256ab813ebd64a3/aiohttp-3.11.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fe4eb0e7f50cdb99b26250d9328faef30b1175a5dbcfd6d0578d18456bac567", size = 1618688 }, + { url = "https://files.pythonhosted.org/packages/71/85/d13c3ea2e48a10b43668305d4903838834c3d4112e5229177fbcc23a56cd/aiohttp-3.11.13-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2a8a6bc19818ac3e5596310ace5aa50d918e1ebdcc204dc96e2f4d505d51740c", size = 1658053 }, + { url = "https://files.pythonhosted.org/packages/12/6a/3242a35100de23c1e8d9e05e8605e10f34268dee91b00d9d1e278c58eb80/aiohttp-3.11.13-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7f27eec42f6c3c1df09cfc1f6786308f8b525b8efaaf6d6bd76c1f52c6511f6a", size = 1616917 }, + { url = "https://files.pythonhosted.org/packages/f5/b3/3f99b6f0a9a79590a7ba5655dbde8408c685aa462247378c977603464d0a/aiohttp-3.11.13-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:2a4a13dfbb23977a51853b419141cd0a9b9573ab8d3a1455c6e63561387b52ff", size = 1685872 }, + { url = "https://files.pythonhosted.org/packages/8a/2e/99672181751f280a85e24fcb9a2c2469e8b1a0de1746b7b5c45d1eb9a999/aiohttp-3.11.13-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:02876bf2f69b062584965507b07bc06903c2dc93c57a554b64e012d636952654", size = 1715719 }, + { url = "https://files.pythonhosted.org/packages/7a/cd/68030356eb9a7d57b3e2823c8a852709d437abb0fbff41a61ebc351b7625/aiohttp-3.11.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b992778d95b60a21c4d8d4a5f15aaab2bd3c3e16466a72d7f9bfd86e8cea0d4b", size = 1673166 }, + { url = 
"https://files.pythonhosted.org/packages/03/61/425397a9a2839c609d09fdb53d940472f316a2dbeaa77a35b2628dae6284/aiohttp-3.11.13-cp313-cp313-win32.whl", hash = "sha256:507ab05d90586dacb4f26a001c3abf912eb719d05635cbfad930bdbeb469b36c", size = 410615 }, + { url = "https://files.pythonhosted.org/packages/9c/54/ebb815bc0fe057d8e7a11c086c479e972e827082f39aeebc6019dd4f0862/aiohttp-3.11.13-cp313-cp313-win_amd64.whl", hash = "sha256:5ceb81a4db2decdfa087381b5fc5847aa448244f973e5da232610304e199e7b2", size = 436452 }, +] + +[[package]] +name = "aiosignal" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ba/b5/6d55e80f6d8a08ce22b982eafa278d823b541c925f11ee774b0b9c43473d/aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54", size = 19424 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, +] + [[package]] name = "aiosqlite" version = "0.21.0" @@ -75,6 +179,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918 }, ] +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233 }, +] + [[package]] name = "attrs" version = "25.1.0" @@ -84,6 +197,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/30/d4986a882011f9df997a55e6becd864812ccfcd821d64aac8570ee39f719/attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a", size = 63152 }, ] +[[package]] +name = "autoevals" +version = "0.0.122" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "braintrust-core" }, + { name = "chevron" }, + { name = "jsonschema" }, + { name = "levenshtein" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/bc/5b34ab9612af9943174fb2a0fb50313e65d5d49cbdf8f503c7321e88f852/autoevals-0.0.122.tar.gz", hash = "sha256:2ad79a0e8bc8532af3b2e54b7823c1c425f7085e2ccd274ef7d42e86aa877bbc", size = 39005 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/e3/8baebf334692a1d3babf72627c728497c115dfd894e8a5c04cb862df07c3/autoevals-0.0.122-py3-none-any.whl", hash = "sha256:c468f9da0bb7a91f6ee3369c9af18b8e0b0bcc57c59dca350dd31de611a08cd4", size = 41917 }, +] + [[package]] name = "babel" version = "2.17.0" @@ -142,6 +271,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ed/4d/1392562369b1139e741b30d624f09fe7091d17dd5579fae5732f044b12bb/blobfile-3.0.0-py3-none-any.whl", hash = 
"sha256:48ecc3307e622804bd8fe13bf6f40e6463c4439eba7a1f9ad49fd78aa63cc658", size = 75413 }, ] +[[package]] +name = "braintrust-core" +version = "0.0.58" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/16/13/ab46b7033b585ecafb636eda505e049bcae31f7b0335e7b83bb8250147ca/braintrust_core-0.0.58.tar.gz", hash = "sha256:213ef6515ea1b5802213035b12b66971b10f4ee55a6bc426e29370d2da063f6c", size = 3610 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/58/a255894436f3eca4a20611785a30a43b85bc75adf1b77f227e1e6d0cce0a/braintrust_core-0.0.58-py3-none-any.whl", hash = "sha256:fa272b70376d2c6692acf00ebd9fb9bae057b0c53b2b6a59a64850bf79757311", size = 4438 }, +] + [[package]] name = "certifi" version = "2025.1.31" @@ -287,6 +425,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767 }, ] +[[package]] +name = "chevron" +version = "0.14.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/1f/ca74b65b19798895d63a6e92874162f44233467c9e7c1ed8afd19016ebe9/chevron-0.14.0.tar.gz", hash = "sha256:87613aafdf6d77b6a90ff073165a61ae5086e21ad49057aa0e53681601800ebf", size = 11440 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/93/342cc62a70ab727e093ed98e02a725d85b746345f05d2b5e5034649f4ec8/chevron-0.14.0-py3-none-any.whl", hash = "sha256:fbf996a709f8da2e745ef763f482ce2d311aa817d287593a5b990d6d6e4f0443", size = 11595 }, +] + [[package]] name = "click" version = "8.1.8" @@ -385,6 +532,31 @@ toml = [ { name = "tomli", marker = "python_full_version <= '3.11'" }, ] +[[package]] +name = "datasets" +version = "3.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/73/0c/dc3d172104e78e68f7a60386664adbf61db5d10c2246b31ddad06c2d1cb3/datasets-3.3.2.tar.gz", hash = "sha256:20901a97da870fb80b407ccc45f034a7ac99accd07da897ed42f11641bdb8c6e", size = 564352 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/37/22ef7675bef4ffe9577b937ddca2e22791534cbbe11c30714972a91532dc/datasets-3.3.2-py3-none-any.whl", hash = "sha256:fdaf3d5d70242621210b044e9b9b15a56e908bfc3e9d077bcf5605ac390f70bd", size = 485360 }, +] + [[package]] name = "debugpy" version = "1.8.12" @@ -431,6 +603,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] +[[package]] +name = "dill" +version = "0.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/17/4d/ac7ffa80c69ea1df30a8aa11b3578692a5118e7cd1aa157e3ef73b092d15/dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", size = 184847 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7", size = 116252 }, +] + [[package]] name = "distlib" version = "0.3.9" @@ -476,17 +657,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702 }, ] -[[package]] -name = "fairscale" -version = "0.4.13" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, - { name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, - { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c1/08/b3334d7b543ac10dcb129cef4f84723ab696725512f18d69ab3a784b0bf5/fairscale-0.4.13.tar.gz", hash = "sha256:1b797825c427f5dba92253fd0d8daa574e8bd651a2423497775fab1b30cfb768", size = 266261 } - [[package]] name = "fastapi" version = "0.115.8" @@ -528,6 +698,75 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/6b/b6/82c7e601d6d3c3278c40b7bd35e17e82aa227f050aa9f66cb7b7fce29471/fire-0.7.0.tar.gz", hash = "sha256:961550f07936eaf65ad1dc8360f2b2bf8408fad46abbfa4d2a3794f8d2a95cdf", size = 87189 } +[[package]] +name = "frozenlist" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8f/ed/0f4cec13a93c02c47ec32d81d11c0c1efbadf4a471e3f3ce7cad366cbbd3/frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817", size = 39930 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/79/29d44c4af36b2b240725dce566b20f63f9b36ef267aaaa64ee7466f4f2f8/frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a", size = 94451 }, + { url = "https://files.pythonhosted.org/packages/47/47/0c999aeace6ead8a44441b4f4173e2261b18219e4ad1fe9a479871ca02fc/frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb", size = 54301 }, + { url = "https://files.pythonhosted.org/packages/8d/60/107a38c1e54176d12e06e9d4b5d755b677d71d1219217cee063911b1384f/frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec", size = 52213 }, + { url = "https://files.pythonhosted.org/packages/17/62/594a6829ac5679c25755362a9dc93486a8a45241394564309641425d3ff6/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5", size = 240946 }, + { url = "https://files.pythonhosted.org/packages/7e/75/6c8419d8f92c80dd0ee3f63bdde2702ce6398b0ac8410ff459f9b6f2f9cb/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76", size = 264608 }, + { url = 
"https://files.pythonhosted.org/packages/88/3e/82a6f0b84bc6fb7e0be240e52863c6d4ab6098cd62e4f5b972cd31e002e8/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17", size = 261361 }, + { url = "https://files.pythonhosted.org/packages/fd/85/14e5f9ccac1b64ff2f10c927b3ffdf88772aea875882406f9ba0cec8ad84/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba", size = 231649 }, + { url = "https://files.pythonhosted.org/packages/ee/59/928322800306f6529d1852323014ee9008551e9bb027cc38d276cbc0b0e7/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d", size = 241853 }, + { url = "https://files.pythonhosted.org/packages/7d/bd/e01fa4f146a6f6c18c5d34cab8abdc4013774a26c4ff851128cd1bd3008e/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2", size = 243652 }, + { url = "https://files.pythonhosted.org/packages/a5/bd/e4771fd18a8ec6757033f0fa903e447aecc3fbba54e3630397b61596acf0/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f", size = 241734 }, + { url = "https://files.pythonhosted.org/packages/21/13/c83821fa5544af4f60c5d3a65d054af3213c26b14d3f5f48e43e5fb48556/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c", size = 260959 }, + { url = "https://files.pythonhosted.org/packages/71/f3/1f91c9a9bf7ed0e8edcf52698d23f3c211d8d00291a53c9f115ceb977ab1/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab", size = 262706 }, + { url = "https://files.pythonhosted.org/packages/4c/22/4a256fdf5d9bcb3ae32622c796ee5ff9451b3a13a68cfe3f68e2c95588ce/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5", size = 250401 }, + { url = "https://files.pythonhosted.org/packages/af/89/c48ebe1f7991bd2be6d5f4ed202d94960c01b3017a03d6954dd5fa9ea1e8/frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb", size = 45498 }, + { url = "https://files.pythonhosted.org/packages/28/2f/cc27d5f43e023d21fe5c19538e08894db3d7e081cbf582ad5ed366c24446/frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4", size = 51622 }, + { url = "https://files.pythonhosted.org/packages/79/43/0bed28bf5eb1c9e4301003b74453b8e7aa85fb293b31dde352aac528dafc/frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30", size = 94987 }, + { url = "https://files.pythonhosted.org/packages/bb/bf/b74e38f09a246e8abbe1e90eb65787ed745ccab6eaa58b9c9308e052323d/frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5", size = 54584 }, + { url = "https://files.pythonhosted.org/packages/2c/31/ab01375682f14f7613a1ade30149f684c84f9b8823a4391ed950c8285656/frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778", size = 52499 }, + { url = "https://files.pythonhosted.org/packages/98/a8/d0ac0b9276e1404f58fec3ab6e90a4f76b778a49373ccaf6a563f100dfbc/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a", size = 276357 }, + { url = "https://files.pythonhosted.org/packages/ad/c9/c7761084fa822f07dac38ac29f841d4587570dd211e2262544aa0b791d21/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869", size = 287516 }, + { url = "https://files.pythonhosted.org/packages/a1/ff/cd7479e703c39df7bdab431798cef89dc75010d8aa0ca2514c5b9321db27/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d", size = 283131 }, + { url = "https://files.pythonhosted.org/packages/59/a0/370941beb47d237eca4fbf27e4e91389fd68699e6f4b0ebcc95da463835b/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45", size = 261320 }, + { url = "https://files.pythonhosted.org/packages/b8/5f/c10123e8d64867bc9b4f2f510a32042a306ff5fcd7e2e09e5ae5100ee333/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d", size = 274877 }, + { url = "https://files.pythonhosted.org/packages/fa/79/38c505601ae29d4348f21706c5d89755ceded02a745016ba2f58bd5f1ea6/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3", size = 269592 }, + { url = "https://files.pythonhosted.org/packages/19/e2/39f3a53191b8204ba9f0bb574b926b73dd2efba2a2b9d2d730517e8f7622/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a", size = 265934 }, + { url = "https://files.pythonhosted.org/packages/d5/c9/3075eb7f7f3a91f1a6b00284af4de0a65a9ae47084930916f5528144c9dd/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9", size = 283859 }, + { url = "https://files.pythonhosted.org/packages/05/f5/549f44d314c29408b962fa2b0e69a1a67c59379fb143b92a0a065ffd1f0f/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2", size = 287560 }, + { url = "https://files.pythonhosted.org/packages/9d/f8/cb09b3c24a3eac02c4c07a9558e11e9e244fb02bf62c85ac2106d1eb0c0b/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf", size = 277150 }, + { url = "https://files.pythonhosted.org/packages/37/48/38c2db3f54d1501e692d6fe058f45b6ad1b358d82cd19436efab80cfc965/frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942", size = 45244 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/2ddffeb8b60a4bce3b196c32fcc30d8830d4615e7b492ec2071da801b8ad/frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d", size = 51634 }, + { url = 
"https://files.pythonhosted.org/packages/79/73/fa6d1a96ab7fd6e6d1c3500700963eab46813847f01ef0ccbaa726181dd5/frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21", size = 94026 }, + { url = "https://files.pythonhosted.org/packages/ab/04/ea8bf62c8868b8eada363f20ff1b647cf2e93377a7b284d36062d21d81d1/frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d", size = 54150 }, + { url = "https://files.pythonhosted.org/packages/d0/9a/8e479b482a6f2070b26bda572c5e6889bb3ba48977e81beea35b5ae13ece/frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/e3/12/2aad87deb08a4e7ccfb33600871bbe8f0e08cb6d8224371387f3303654d7/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a", size = 282647 }, + { url = "https://files.pythonhosted.org/packages/77/f2/07f06b05d8a427ea0060a9cef6e63405ea9e0d761846b95ef3fb3be57111/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a", size = 289052 }, + { url = "https://files.pythonhosted.org/packages/bd/9f/8bf45a2f1cd4aa401acd271b077989c9267ae8463e7c8b1eb0d3f561b65e/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee", size = 291719 }, + { url = "https://files.pythonhosted.org/packages/41/d1/1f20fd05a6c42d3868709b7604c9f15538a29e4f734c694c6bcfc3d3b935/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6", size = 267433 }, + { url = "https://files.pythonhosted.org/packages/af/f2/64b73a9bb86f5a89fb55450e97cd5c1f84a862d4ff90d9fd1a73ab0f64a5/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e", size = 283591 }, + { url = "https://files.pythonhosted.org/packages/29/e2/ffbb1fae55a791fd6c2938dd9ea779509c977435ba3940b9f2e8dc9d5316/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9", size = 273249 }, + { url = "https://files.pythonhosted.org/packages/2e/6e/008136a30798bb63618a114b9321b5971172a5abddff44a100c7edc5ad4f/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039", size = 271075 }, + { url = "https://files.pythonhosted.org/packages/ae/f0/4e71e54a026b06724cec9b6c54f0b13a4e9e298cc8db0f82ec70e151f5ce/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784", size = 285398 }, + { url = "https://files.pythonhosted.org/packages/4d/36/70ec246851478b1c0b59f11ef8ade9c482ff447c1363c2bd5fad45098b12/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631", size = 294445 }, + { url = 
"https://files.pythonhosted.org/packages/37/e0/47f87544055b3349b633a03c4d94b405956cf2437f4ab46d0928b74b7526/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f", size = 280569 }, + { url = "https://files.pythonhosted.org/packages/f9/7c/490133c160fb6b84ed374c266f42800e33b50c3bbab1652764e6e1fc498a/frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8", size = 44721 }, + { url = "https://files.pythonhosted.org/packages/b1/56/4e45136ffc6bdbfa68c29ca56ef53783ef4c2fd395f7cbf99a2624aa9aaa/frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f", size = 51329 }, + { url = "https://files.pythonhosted.org/packages/da/3b/915f0bca8a7ea04483622e84a9bd90033bab54bdf485479556c74fd5eaf5/frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953", size = 91538 }, + { url = "https://files.pythonhosted.org/packages/c7/d1/a7c98aad7e44afe5306a2b068434a5830f1470675f0e715abb86eb15f15b/frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0", size = 52849 }, + { url = "https://files.pythonhosted.org/packages/3a/c8/76f23bf9ab15d5f760eb48701909645f686f9c64fbb8982674c241fbef14/frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2", size = 50583 }, + { url = "https://files.pythonhosted.org/packages/1f/22/462a3dd093d11df623179d7754a3b3269de3b42de2808cddef50ee0f4f48/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f", size = 265636 }, + { url = "https://files.pythonhosted.org/packages/80/cf/e075e407fc2ae7328155a1cd7e22f932773c8073c1fc78016607d19cc3e5/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608", size = 270214 }, + { url = "https://files.pythonhosted.org/packages/a1/58/0642d061d5de779f39c50cbb00df49682832923f3d2ebfb0fedf02d05f7f/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b", size = 273905 }, + { url = "https://files.pythonhosted.org/packages/ab/66/3fe0f5f8f2add5b4ab7aa4e199f767fd3b55da26e3ca4ce2cc36698e50c4/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840", size = 250542 }, + { url = "https://files.pythonhosted.org/packages/f6/b8/260791bde9198c87a465224e0e2bb62c4e716f5d198fc3a1dacc4895dbd1/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439", size = 267026 }, + { url = "https://files.pythonhosted.org/packages/2e/a4/3d24f88c527f08f8d44ade24eaee83b2627793fa62fa07cbb7ff7a2f7d42/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de", size = 257690 }, + { url = 
"https://files.pythonhosted.org/packages/de/9a/d311d660420b2beeff3459b6626f2ab4fb236d07afbdac034a4371fe696e/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641", size = 253893 }, + { url = "https://files.pythonhosted.org/packages/c6/23/e491aadc25b56eabd0f18c53bb19f3cdc6de30b2129ee0bc39cd387cd560/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e", size = 267006 }, + { url = "https://files.pythonhosted.org/packages/08/c4/ab918ce636a35fb974d13d666dcbe03969592aeca6c3ab3835acff01f79c/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9", size = 276157 }, + { url = "https://files.pythonhosted.org/packages/c0/29/3b7a0bbbbe5a34833ba26f686aabfe982924adbdcafdc294a7a129c31688/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03", size = 264642 }, + { url = "https://files.pythonhosted.org/packages/ab/42/0595b3dbffc2e82d7fe658c12d5a5bafcd7516c6bf2d1d1feb5387caa9c1/frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c", size = 44914 }, + { url = "https://files.pythonhosted.org/packages/17/c4/b7db1206a3fea44bf3b838ca61deb6f74424a8a5db1dd53ecb21da669be6/frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28", size = 51167 }, + { url = "https://files.pythonhosted.org/packages/c6/c8/a5be5b7550c10858fcf9b0ea054baccab474da77d37f1e828ce043a3a5d4/frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3", size = 11901 }, +] + [[package]] name = "fsspec" version = "2025.2.0" @@ -537,6 +776,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, ] +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, +] + [[package]] name = "googleapis-common-protos" version = "1.67.0" @@ -549,23 +793,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/30/2bd0eb03a7dee7727cd2ec643d1e992979e62d5e7443507381cce0455132/googleapis_common_protos-1.67.0-py2.py3-none-any.whl", hash = "sha256:579de760800d13616f51cf8be00c876f00a9f146d3e6510e19d1f4111758b741", size = 164985 }, ] -[[package]] -name = "groq" -version = "0.18.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "anyio" }, - { name = "distro" }, - { name = "httpx" }, - { name = "pydantic" }, - { name = "sniffio" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/40/8c/e72c164474a88dfed6c7327ad53cb87ff11566b74b3a76d41dc7b94fc51c/groq-0.18.0.tar.gz", hash = "sha256:8e2ccfea406d68b3525af4b7c0e321fcb3d2a73fc60bb70b4156e6cd88c72f03", size = 117322 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/6c/5a53d632b44ef7655ac8d9b34432e13160917f9307c94b1467efd34e336e/groq-0.18.0-py3-none-any.whl", hash = "sha256:81d5ac00057a45d8ce559d23ab5d3b3893011d1f12c35187ab35a9182d826ea6", size = 121911 }, -] - [[package]] name = "h11" version = "0.14.0" @@ -603,6 +830,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, ] +[[package]] +name = "httpx-sse" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/60/8f4281fa9bbf3c8034fd54c0e7412e66edbab6bc74c4996bd616f8d0406e/httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721", size = 12624 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f", size = 7819 }, +] + [[package]] name = "huggingface-hub" version = "0.29.0" @@ -669,15 +905,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] -[[package]] -name = "interegular" -version = "0.3.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/dc/9d/8b6dde58a028a3962ce17e84d5fe73758df61378e00ef8ac3d85da34b0ff/interegular-0.3.3.tar.gz", hash = "sha256:d9b697b21b34884711399ba0f0376914b81899ce670032486d0d048344a76600", size = 24705 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c4/01/72d6472f80651673716d1deda2a5bbb633e563ecf94f4479da5519d69d25/interegular-0.3.3-py37-none-any.whl", hash = "sha256:b0c07007d48c89d6d19f7204972d369b2a77222722e126b6aa63aa721dc3b19c", size = 23635 }, -] - [[package]] name = "ipykernel" version = "6.29.5" @@ -864,6 +1091,89 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 }, ] +[[package]] +name = "levenshtein" +version = "0.27.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "rapidfuzz" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/b3/b5f8011483ba9083a0bc74c4d58705e9cf465fbe55c948a1b1357d0a2aa8/levenshtein-0.27.1.tar.gz", hash = "sha256:3e18b73564cfc846eec94dd13fab6cb006b5d2e0cc56bad1fd7d5585881302e3", size = 382571 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/b1/9906a75b98dd9c008015a72d7658be53851e361a35492631edf1b1f334ab/levenshtein-0.27.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13d6f617cb6fe63714c4794861cfaacd398db58a292f930edb7f12aad931dace", size = 174542 }, + { url = "https://files.pythonhosted.org/packages/3b/57/e26e0164a93fb045316856603111d95538cac8224a3709e4ac96a6bb74f3/levenshtein-0.27.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca9d54d41075e130c390e61360bec80f116b62d6ae973aec502e77e921e95334", size = 156367 }, + { url = "https://files.pythonhosted.org/packages/6d/dd/92fcb71d48c1fe69c46c211156adafb8175037dc63e80e970106aef3f9d5/levenshtein-0.27.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1f822b5c9a20d10411f779dfd7181ce3407261436f8470008a98276a9d07f", size = 152189 }, + { url = 
"https://files.pythonhosted.org/packages/5e/23/3f331f5fbfa93634126439cfc8c01b31f7ef1fbedb81663581e27a69da4d/levenshtein-0.27.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:81270392c2e45d1a7e1b3047c3a272d5e28bb4f1eff0137637980064948929b7", size = 184271 }, + { url = "https://files.pythonhosted.org/packages/5a/76/d6ac541a1a80bdc5c98584a6a2d2301e677af4cb2e4092247207791b56a6/levenshtein-0.27.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d30c3ea23a94dddd56dbe323e1fa8a29ceb24da18e2daa8d0abf78b269a5ad1", size = 185078 }, + { url = "https://files.pythonhosted.org/packages/2d/ed/d0c5abe8cfcf6a7f2a4197e889e12b7a0c2145a0ef3354b1c000bf367305/levenshtein-0.27.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3e0bea76695b9045bbf9ad5f67ad4cc01c11f783368f34760e068f19b6a6bc", size = 161505 }, + { url = "https://files.pythonhosted.org/packages/f3/28/a5b78e1818211bc6407590876bbdcc6d79671e529a0c186780492c1f2136/levenshtein-0.27.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdd190e468a68c31a5943368a5eaf4e130256a8707886d23ab5906a0cb98a43c", size = 246968 }, + { url = "https://files.pythonhosted.org/packages/77/7f/981b903583956cb67b33bed39d9840ab5e4c7062bceec564b7bf2c3f6f49/levenshtein-0.27.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7c3121314bb4b676c011c33f6a0ebb462cfdcf378ff383e6f9e4cca5618d0ba7", size = 1116000 }, + { url = "https://files.pythonhosted.org/packages/75/1d/c4be47d5f436fd310373c5ebdf05828c1d95be9a44c3e94f29c40937b30c/levenshtein-0.27.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f8ef378c873efcc5e978026b69b45342d841cd7a2f273447324f1c687cc4dc37", size = 1401162 }, + { url = "https://files.pythonhosted.org/packages/91/e4/0b107676efe3ecd5fada1ed3a3bbddd4c829e2ef34e980b76374c116235b/levenshtein-0.27.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ff18d78c5c16bea20876425e1bf5af56c25918fb01bc0f2532db1317d4c0e157", size = 1225141 }, + { url = "https://files.pythonhosted.org/packages/29/f0/f3f88d766fdbb1d39fe98dc5527223bae099444e501550ae088c47ddd97b/levenshtein-0.27.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:13412ff805afbfe619d070280d1a76eb4198c60c5445cd5478bd4c7055bb3d51", size = 1419707 }, + { url = "https://files.pythonhosted.org/packages/b8/1c/f51ac1db4064a85effa50df240250e413f428164301d836c312baf09381e/levenshtein-0.27.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a2adb9f263557f7fb13e19eb2f34595d86929a44c250b2fca6e9b65971e51e20", size = 1189284 }, + { url = "https://files.pythonhosted.org/packages/e0/67/5ace76bc964b93ed6203a9f8c4dcde1a50e336468f7da3a21dd29febaf46/levenshtein-0.27.1-cp310-cp310-win32.whl", hash = "sha256:6278a33d2e0e909d8829b5a72191419c86dd3bb45b82399c7efc53dabe870c35", size = 88036 }, + { url = "https://files.pythonhosted.org/packages/06/e0/d9737dbbe85842ddb300cb7974fc065edc56ec647652863f95ac1977d378/levenshtein-0.27.1-cp310-cp310-win_amd64.whl", hash = "sha256:5b602b8428ee5dc88432a55c5303a739ee2be7c15175bd67c29476a9d942f48e", size = 99922 }, + { url = "https://files.pythonhosted.org/packages/27/b8/13e22789ab700db0da98f973a508643dbe2d25bd0fb5dc53239e0e2852c1/levenshtein-0.27.1-cp310-cp310-win_arm64.whl", hash = "sha256:48334081fddaa0c259ba01ee898640a2cf8ede62e5f7e25fefece1c64d34837f", size = 87846 }, + { url = "https://files.pythonhosted.org/packages/22/84/110136e740655779aceb0da2399977362f21b2dbf3ea3646557f9c2237c4/levenshtein-0.27.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:2e6f1760108319a108dceb2f02bc7cdb78807ad1f9c673c95eaa1d0fe5dfcaae", size = 174555 }, + { url = "https://files.pythonhosted.org/packages/19/5b/176d96959f5c5969f356d8856f8e20d2e72f7e4879f6d1cda8e5c2ac2614/levenshtein-0.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c4ed8400d94ab348099395e050b8ed9dd6a5d6b5b9e75e78b2b3d0b5f5b10f38", size = 156286 }, + { url = "https://files.pythonhosted.org/packages/2a/2d/a75abaafc8a46b0dc52ab14dc96708989a31799a02a4914f9210c3415f04/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7826efe51be8ff58bc44a633e022fdd4b9fc07396375a6dbc4945a3bffc7bf8f", size = 152413 }, + { url = "https://files.pythonhosted.org/packages/9a/5f/533f4adf964b10817a1d0ecca978b3542b3b9915c96172d20162afe18bed/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff5afb78719659d353055863c7cb31599fbea6865c0890b2d840ee40214b3ddb", size = 184236 }, + { url = "https://files.pythonhosted.org/packages/02/79/e698623795e36e0d166a3aa1eac6fe1e446cac3a5c456664a95c351571d1/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:201dafd5c004cd52018560cf3213da799534d130cf0e4db839b51f3f06771de0", size = 185502 }, + { url = "https://files.pythonhosted.org/packages/ac/94/76b64762f4af6e20bbab79713c4c48783240e6e502b2f52e5037ddda688a/levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5ddd59f3cfaec216811ee67544779d9e2d6ed33f79337492a248245d6379e3d", size = 161749 }, + { url = "https://files.pythonhosted.org/packages/56/d0/d10eff9224c94a478078a469aaeb43471fdeddad035f443091224c7544b8/levenshtein-0.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6afc241d27ecf5b921063b796812c55b0115423ca6fa4827aa4b1581643d0a65", size = 246686 }, + { url = "https://files.pythonhosted.org/packages/b2/8a/ebbeff74461da3230d00e8a8197480a2ea1a9bbb7dbc273214d7ea3896cb/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee2e766277cceb8ca9e584ea03b8dc064449ba588d3e24c1923e4b07576db574", size = 1116616 }, + { url = "https://files.pythonhosted.org/packages/1d/9b/e7323684f833ede13113fba818c3afe665a78b47d720afdeb2e530c1ecb3/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:920b23d6109453913ce78ec451bc402ff19d020ee8be4722e9d11192ec2fac6f", size = 1401483 }, + { url = "https://files.pythonhosted.org/packages/ef/1d/9b6ab30ff086a33492d6f7de86a07050b15862ccf0d9feeccfbe26af52d8/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:560d7edba126e2eea3ac3f2f12e7bd8bc9c6904089d12b5b23b6dfa98810b209", size = 1225805 }, + { url = "https://files.pythonhosted.org/packages/1b/07/ae2f31e87ff65ba4857e25192646f1f3c8cca83c2ac1c27e551215b7e1b6/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:8d5362b6c7aa4896dc0cb1e7470a4ad3c06124e0af055dda30d81d3c5549346b", size = 1419860 }, + { url = "https://files.pythonhosted.org/packages/43/d2/dfcc5c22c07bab9be99f3f47a907be583bcd37bfd2eec57a205e59671019/levenshtein-0.27.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:65ba880815b0f80a80a293aeebac0fab8069d03ad2d6f967a886063458f9d7a1", size = 1188823 }, + { url = "https://files.pythonhosted.org/packages/8b/96/713335623f8ab50eba0627c8685618dc3a985aedaaea9f492986b9443551/levenshtein-0.27.1-cp311-cp311-win32.whl", hash = "sha256:fcc08effe77fec0bc5b0f6f10ff20b9802b961c4a69047b5499f383119ddbe24", size = 88156 }, + { url = 
"https://files.pythonhosted.org/packages/aa/ae/444d6e8ba9a35379a56926716f18bb2e77c6cf69e5324521fbe6885f14f6/levenshtein-0.27.1-cp311-cp311-win_amd64.whl", hash = "sha256:0ed402d8902be7df212ac598fc189f9b2d520817fdbc6a05e2ce44f7f3ef6857", size = 100399 }, + { url = "https://files.pythonhosted.org/packages/80/c0/ff226897a238a2deb2ca2c00d658755a1aa01884b0ddc8f5d406cb5f2b0d/levenshtein-0.27.1-cp311-cp311-win_arm64.whl", hash = "sha256:7fdaab29af81a8eb981043737f42450efca64b9761ca29385487b29c506da5b5", size = 88033 }, + { url = "https://files.pythonhosted.org/packages/0d/73/84a7126b9e6441c2547f1fbfd65f3c15c387d1fc04e0dd1d025a12107771/levenshtein-0.27.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:25fb540d8c55d1dc7bdc59b7de518ea5ed9df92eb2077e74bcb9bb6de7b06f69", size = 173953 }, + { url = "https://files.pythonhosted.org/packages/8f/5c/06c01870c0cf336f9f29397bbfbfbbfd3a59918868716e7bb15828e89367/levenshtein-0.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f09cfab6387e9c908c7b37961c045e8e10eb9b7ec4a700367f8e080ee803a562", size = 156399 }, + { url = "https://files.pythonhosted.org/packages/c7/4a/c1d3f27ec8b3fff5a96617251bf3f61c67972869ac0a0419558fc3e2cbe6/levenshtein-0.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dafa29c0e616f322b574e0b2aeb5b1ff2f8d9a1a6550f22321f3bd9bb81036e3", size = 151061 }, + { url = "https://files.pythonhosted.org/packages/4d/8f/2521081e9a265891edf46aa30e1b59c1f347a452aed4c33baafbec5216fa/levenshtein-0.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be7a7642ea64392fa1e6ef7968c2e50ef2152c60948f95d0793361ed97cf8a6f", size = 183119 }, + { url = "https://files.pythonhosted.org/packages/1f/a0/a63e3bce6376127596d04be7f57e672d2f3d5f540265b1e30b9dd9b3c5a9/levenshtein-0.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:060b48c45ed54bcea9582ce79c6365b20a1a7473767e0b3d6be712fa3a22929c", size = 185352 }, + { url = "https://files.pythonhosted.org/packages/17/8c/8352e992063952b38fb61d49bad8d193a4a713e7eeceb3ae74b719d7863d/levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:712f562c5e64dd0398d3570fe99f8fbb88acec7cc431f101cb66c9d22d74c542", size = 159879 }, + { url = "https://files.pythonhosted.org/packages/69/b4/564866e2038acf47c3de3e9292fc7fc7cc18d2593fedb04f001c22ac6e15/levenshtein-0.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a6141ad65cab49aa4527a3342d76c30c48adb2393b6cdfeca65caae8d25cb4b8", size = 245005 }, + { url = "https://files.pythonhosted.org/packages/ba/f9/7367f87e3a6eed282f3654ec61a174b4d1b78a7a73f2cecb91f0ab675153/levenshtein-0.27.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:799b8d73cda3265331116f62932f553804eae16c706ceb35aaf16fc2a704791b", size = 1116865 }, + { url = "https://files.pythonhosted.org/packages/f5/02/b5b3bfb4b4cd430e9d110bad2466200d51c6061dae7c5a64e36047c8c831/levenshtein-0.27.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ec99871d98e517e1cc4a15659c62d6ea63ee5a2d72c5ddbebd7bae8b9e2670c8", size = 1401723 }, + { url = "https://files.pythonhosted.org/packages/ef/69/b93bccd093b3f06a99e67e11ebd6e100324735dc2834958ba5852a1b9fed/levenshtein-0.27.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8799164e1f83588dbdde07f728ea80796ea72196ea23484d78d891470241b222", size = 1226276 }, + { url = 
"https://files.pythonhosted.org/packages/ab/32/37dd1bc5ce866c136716619e6f7081d7078d7dd1c1da7025603dcfd9cf5f/levenshtein-0.27.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:583943813898326516ab451a83f734c6f07488cda5c361676150d3e3e8b47927", size = 1420132 }, + { url = "https://files.pythonhosted.org/packages/4b/08/f3bc828dd9f0f8433b26f37c4fceab303186ad7b9b70819f2ccb493d99fc/levenshtein-0.27.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5bb22956af44bb4eade93546bf95be610c8939b9a9d4d28b2dfa94abf454fed7", size = 1189144 }, + { url = "https://files.pythonhosted.org/packages/2d/54/5ecd89066cf579223d504abe3ac37ba11f63b01a19fd12591083acc00eb6/levenshtein-0.27.1-cp312-cp312-win32.whl", hash = "sha256:d9099ed1bcfa7ccc5540e8ad27b5dc6f23d16addcbe21fdd82af6440f4ed2b6d", size = 88279 }, + { url = "https://files.pythonhosted.org/packages/53/79/4f8fabcc5aca9305b494d1d6c7a98482e90a855e0050ae9ff5d7bf4ab2c6/levenshtein-0.27.1-cp312-cp312-win_amd64.whl", hash = "sha256:7f071ecdb50aa6c15fd8ae5bcb67e9da46ba1df7bba7c6bf6803a54c7a41fd96", size = 100659 }, + { url = "https://files.pythonhosted.org/packages/cb/81/f8e4c0f571c2aac2e0c56a6e0e41b679937a2b7013e79415e4aef555cff0/levenshtein-0.27.1-cp312-cp312-win_arm64.whl", hash = "sha256:83b9033a984ccace7703f35b688f3907d55490182fd39b33a8e434d7b2e249e6", size = 88168 }, + { url = "https://files.pythonhosted.org/packages/c6/d3/30485fb9aee848542ee2d01aba85106a7f5da982ebeeffc619f70ea593c7/levenshtein-0.27.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ab00c2cae2889166afb7e1af64af2d4e8c1b126f3902d13ef3740df00e54032d", size = 173397 }, + { url = "https://files.pythonhosted.org/packages/df/9f/40a81c54cfe74b22737710e654bd25ad934a675f737b60b24f84099540e0/levenshtein-0.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c27e00bc7527e282f7c437817081df8da4eb7054e7ef9055b851fa3947896560", size = 155787 }, + { url = "https://files.pythonhosted.org/packages/df/98/915f4e24e21982b6eca2c0203546c160f4a83853fa6a2ac6e2b208a54afc/levenshtein-0.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5b07de42bfc051136cc8e7f1e7ba2cb73666aa0429930f4218efabfdc5837ad", size = 150013 }, + { url = "https://files.pythonhosted.org/packages/80/93/9b0773107580416b9de14bf6a12bd1dd2b2964f7a9f6fb0e40723e1f0572/levenshtein-0.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb11ad3c9dae3063405aa50d9c96923722ab17bb606c776b6817d70b51fd7e07", size = 181234 }, + { url = "https://files.pythonhosted.org/packages/91/b1/3cd4f69af32d40de14808142cc743af3a1b737b25571bd5e8d2f46b885e0/levenshtein-0.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c5986fb46cb0c063305fd45b0a79924abf2959a6d984bbac2b511d3ab259f3f", size = 183697 }, + { url = "https://files.pythonhosted.org/packages/bb/65/b691e502c6463f6965b7e0d8d84224c188aa35b53fbc85853c72a0e436c9/levenshtein-0.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75191e469269ddef2859bc64c4a8cfd6c9e063302766b5cb7e1e67f38cc7051a", size = 159964 }, + { url = "https://files.pythonhosted.org/packages/0f/c0/89a922a47306a475fb6d8f2ab08668f143d3dc7dea4c39d09e46746e031c/levenshtein-0.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:51b3a7b2266933babc04e4d9821a495142eebd6ef709f90e24bc532b52b81385", size = 244759 }, + { url = 
"https://files.pythonhosted.org/packages/b4/93/30283c6e69a6556b02e0507c88535df9613179f7b44bc49cdb4bc5e889a3/levenshtein-0.27.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bbac509794afc3e2a9e73284c9e3d0aab5b1d928643f42b172969c3eefa1f2a3", size = 1115955 }, + { url = "https://files.pythonhosted.org/packages/0b/cf/7e19ea2c23671db02fbbe5a5a4aeafd1d471ee573a6251ae17008458c434/levenshtein-0.27.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8d68714785178347ecb272b94e85cbf7e638165895c4dd17ab57e7742d8872ec", size = 1400921 }, + { url = "https://files.pythonhosted.org/packages/e3/f7/fb42bfe2f3b46ef91f0fc6fa217b44dbeb4ef8c72a9c1917bbbe1cafc0f8/levenshtein-0.27.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:8ee74ee31a5ab8f61cd6c6c6e9ade4488dde1285f3c12207afc018393c9b8d14", size = 1225037 }, + { url = "https://files.pythonhosted.org/packages/74/25/c86f8874ac7b0632b172d0d1622ed3ab9608a7f8fe85d41d632b16f5948e/levenshtein-0.27.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f2441b6365453ec89640b85344afd3d602b0d9972840b693508074c613486ce7", size = 1420601 }, + { url = "https://files.pythonhosted.org/packages/20/fe/ebfbaadcd90ea7dfde987ae95b5c11dc27c2c5d55a2c4ccbbe4e18a8af7b/levenshtein-0.27.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a9be39640a46d8a0f9be729e641651d16a62b2c07d3f4468c36e1cc66b0183b9", size = 1188241 }, + { url = "https://files.pythonhosted.org/packages/2e/1a/aa6b07316e10781a6c5a5a8308f9bdc22213dc3911b959daa6d7ff654fc6/levenshtein-0.27.1-cp313-cp313-win32.whl", hash = "sha256:a520af67d976761eb6580e7c026a07eb8f74f910f17ce60e98d6e492a1f126c7", size = 88103 }, + { url = "https://files.pythonhosted.org/packages/9d/7b/9bbfd417f80f1047a28d0ea56a9b38b9853ba913b84dd5998785c5f98541/levenshtein-0.27.1-cp313-cp313-win_amd64.whl", hash = "sha256:7dd60aa49c2d8d23e0ef6452c8329029f5d092f386a177e3385d315cabb78f2a", size = 100579 }, + { url = "https://files.pythonhosted.org/packages/8b/01/5f3ff775db7340aa378b250e2a31e6b4b038809a24ff0a3636ef20c7ca31/levenshtein-0.27.1-cp313-cp313-win_arm64.whl", hash = "sha256:149cd4f0baf5884ac5df625b7b0d281721b15de00f447080e38f5188106e1167", size = 87933 }, + { url = "https://files.pythonhosted.org/packages/25/ed/37e2d1f5e690d7376cd7e8bdd19411479ff352a3df9ab5f845dd680ef779/levenshtein-0.27.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c92a222ab95b8d903eae6d5e7d51fe6c999be021b647715c18d04d0b0880f463", size = 170482 }, + { url = "https://files.pythonhosted.org/packages/6d/9f/30b1144b9d1da74743e7d7cdf47575b7013c9767e608c7454dbd318aacd2/levenshtein-0.27.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:71afc36b4ee950fa1140aff22ffda9e5e23280285858e1303260dbb2eabf342d", size = 153106 }, + { url = "https://files.pythonhosted.org/packages/b1/c5/18d0bec94a166cebaefa3db4beab9a7e0d75412b52e9626f5dce1ca8d149/levenshtein-0.27.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b1daeebfc148a571f09cfe18c16911ea1eaaa9e51065c5f7e7acbc4b866afa", size = 150984 }, + { url = "https://files.pythonhosted.org/packages/55/b4/4b80eb0c96caabdb683256cac9cc2cc9a73dee8ea80ab7cc3ee8aebd603f/levenshtein-0.27.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:105edcb14797d95c77f69bad23104314715a64cafbf4b0e79d354a33d7b54d8d", size = 158673 }, + { url = 
"https://files.pythonhosted.org/packages/81/14/a43daefbc6d5e5561176150363cbac73003795b85ae136ffd4d0691af3fb/levenshtein-0.27.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c58fb1ef8bdc8773d705fbacf628e12c3bb63ee4d065dda18a76e86042444a", size = 244419 }, + { url = "https://files.pythonhosted.org/packages/d0/55/34f133f4f0998d7335bd96b9d315dc888b118e48e999c3d2c621b84965b9/levenshtein-0.27.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e52270591854af67217103955a36bd7436b57c801e3354e73ba44d689ed93697", size = 97932 }, + { url = "https://files.pythonhosted.org/packages/7d/44/c5955d0b6830925559b00617d80c9f6e03a9b00c451835ee4da7010e71cd/levenshtein-0.27.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:909b7b6bce27a4ec90576c9a9bd9af5a41308dfecf364b410e80b58038277bbe", size = 170533 }, + { url = "https://files.pythonhosted.org/packages/e7/3f/858572d68b33e13a9c154b99f153317efe68381bf63cc4e986e820935fc3/levenshtein-0.27.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d193a7f97b8c6a350e36ec58e41a627c06fa4157c3ce4b2b11d90cfc3c2ebb8f", size = 153119 }, + { url = "https://files.pythonhosted.org/packages/d1/60/2bd8d001ea4eb53ca16faa7a649d56005ba22b1bcc2a4f1617ab27ed7e48/levenshtein-0.27.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:614be316e3c06118705fae1f717f9072d35108e5fd4e66a7dd0e80356135340b", size = 149576 }, + { url = "https://files.pythonhosted.org/packages/e4/db/0580797e1e4ac26cf67761a235b29b49f62d2b175dbbc609882f2aecd4e4/levenshtein-0.27.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31fc0a5bb070722bdabb6f7e14955a294a4a968c68202d294699817f21545d22", size = 157445 }, + { url = "https://files.pythonhosted.org/packages/f4/de/9c171c96d1f15c900086d7212b5543a85539e767689fc4933d14048ba1ec/levenshtein-0.27.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9415aa5257227af543be65768a80c7a75e266c3c818468ce6914812f88f9c3df", size = 243141 }, + { url = "https://files.pythonhosted.org/packages/dc/1e/408fd10217eac0e43aea0604be22b4851a09e03d761d44d4ea12089dd70e/levenshtein-0.27.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:7987ef006a3cf56a4532bd4c90c2d3b7b4ca9ad3bf8ae1ee5713c4a3bdfda913", size = 98045 }, +] + [[package]] name = "llama-stack" version = "0.1.6" @@ -922,43 +1232,50 @@ docs = [ ] test = [ { name = "aiosqlite" }, + { name = "autoevals" }, { name = "chardet" }, - { name = "fairscale" }, - { name = "groq" }, - { name = "lm-format-enforcer" }, - { name = "ollama" }, + { name = "datasets" }, + { name = "mcp" }, { name = "openai" }, { name = "opentelemetry-exporter-otlp-proto-http" }, { name = "opentelemetry-sdk" }, { name = "pypdf" }, - { name = "sqlite-vec" }, { name = "torch", version = "2.6.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin'" }, { name = "torchvision", version = "0.21.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or sys_platform == 'darwin'" }, { name = "torchvision", version = "0.21.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and 
sys_platform != 'linux')" }, ] +unit = [ + { name = "aiosqlite" }, + { name = "chardet" }, + { name = "openai" }, + { name = "pypdf" }, + { name = "sqlite-vec" }, +] [package.metadata] requires-dist = [ { name = "aiosqlite", marker = "extra == 'test'" }, + { name = "aiosqlite", marker = "extra == 'unit'" }, + { name = "autoevals", marker = "extra == 'test'" }, { name = "black", marker = "extra == 'dev'" }, { name = "blobfile" }, { name = "chardet", marker = "extra == 'test'" }, - { name = "fairscale", marker = "extra == 'test'", specifier = ">=0.4.13" }, + { name = "chardet", marker = "extra == 'unit'" }, + { name = "datasets", marker = "extra == 'test'" }, { name = "fastapi", marker = "extra == 'dev'" }, { name = "fire" }, - { name = "groq", marker = "extra == 'test'" }, { name = "httpx" }, { name = "huggingface-hub" }, { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, { name = "llama-stack-client", specifier = ">=0.1.6" }, - { name = "lm-format-enforcer", marker = "extra == 'test'", specifier = ">=0.10.9" }, + { name = "mcp", marker = "extra == 'test'" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, - { name = "ollama", marker = "extra == 'test'" }, { name = "openai", marker = "extra == 'test'" }, + { name = "openai", marker = "extra == 'unit'" }, { name = "opentelemetry-exporter-otlp-proto-http", marker = "extra == 'test'" }, { name = "opentelemetry-sdk", marker = "extra == 'test'" }, { name = "pillow" }, @@ -967,6 +1284,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2" }, { name = "pydantic", marker = "extra == 'codegen'" }, { name = "pypdf", marker = "extra == 'test'" }, + { name = "pypdf", marker = "extra == 'unit'" }, { name = "pytest", marker = "extra == 'dev'" }, { name = "pytest-asyncio", marker = "extra == 'dev'" }, { name = "pytest-cov", marker = "extra == 'dev'" }, @@ -986,7 +1304,7 @@ requires-dist = [ { name = "sphinxcontrib-mermaid", marker = "extra == 'docs'" }, { name = "sphinxcontrib-redoc", marker = "extra == 'docs'" }, { name = "sphinxcontrib-video", marker = "extra == 'docs'" }, - { name = "sqlite-vec", marker = "extra == 'test'" }, + { name = "sqlite-vec", marker = "extra == 'unit'" }, { name = "termcolor" }, { name = "tiktoken" }, { name = "tomli", marker = "extra == 'docs'" }, @@ -996,6 +1314,7 @@ requires-dist = [ { name = "types-setuptools", marker = "extra == 'dev'" }, { name = "uvicorn", marker = "extra == 'dev'" }, ] +provides-extras = ["dev", "unit", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" @@ -1021,21 +1340,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/51/1102914f819cf4412a5c9fd3f7dcc28175608e5f01ee164885972c3ec30b/llama_stack_client-0.1.6-py3-none-any.whl", hash = "sha256:708e20630d4e97a1cb03a19b933f4da6748cc857fe170998c392cf0f30f0f4c7", size = 373941 }, ] -[[package]] -name = "lm-format-enforcer" -version = "0.10.10" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "interegular" }, - { name = "packaging" }, - { name = "pydantic" }, - { name = "pyyaml" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9d/3f/1ec9e91208a2b8af28ef2caf096e70446d7b3c7218c891fffa899608bf08/lm_format_enforcer-0.10.10.tar.gz", hash = "sha256:b1ff9530ccf73097e35bded94737677c9768a235d74b26af8cd25414efdf85f5", size = 39393 } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/32/55/9b91312b7b59903ffa2d1c4310cbeecfea0f8e8e12b154d7ad1d093d0b03/lm_format_enforcer-0.10.10-py3-none-any.whl", hash = "sha256:c5e4330c717780b046c77f46699f8a668cb2b806da540c0127da942538d13695", size = 44231 }, -] - [[package]] name = "lxml" version = "5.3.1" @@ -1200,6 +1504,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 }, ] +[[package]] +name = "mcp" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "httpx" }, + { name = "httpx-sse" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "sse-starlette" }, + { name = "starlette" }, + { name = "uvicorn" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6b/b6/81e5f2490290351fc97bf46c24ff935128cb7d34d68e3987b522f26f7ada/mcp-1.3.0.tar.gz", hash = "sha256:f409ae4482ce9d53e7ac03f3f7808bcab735bdfc0fba937453782efb43882d45", size = 150235 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/d2/a9e87b506b2094f5aa9becc1af5178842701b27217fa43877353da2577e3/mcp-1.3.0-py3-none-any.whl", hash = "sha256:2829d67ce339a249f803f22eba5e90385eafcac45c94b00cab6cef7e8f217211", size = 70672 }, +] + [[package]] name = "mdit-py-plugins" version = "0.4.2" @@ -1230,6 +1553,96 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198 }, ] +[[package]] +name = "multidict" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/be/504b89a5e9ca731cd47487e91c469064f8ae5af93b7259758dcfc2b9c848/multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a", size = 64002 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/68/259dee7fd14cf56a17c554125e534f6274c2860159692a414d0b402b9a6d/multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60", size = 48628 }, + { url = "https://files.pythonhosted.org/packages/50/79/53ba256069fe5386a4a9e80d4e12857ced9de295baf3e20c68cdda746e04/multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1", size = 29327 }, + { url = "https://files.pythonhosted.org/packages/ff/10/71f1379b05b196dae749b5ac062e87273e3f11634f447ebac12a571d90ae/multidict-6.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a114d03b938376557927ab23f1e950827c3b893ccb94b62fd95d430fd0e5cf53", size = 29689 }, + { url = "https://files.pythonhosted.org/packages/71/45/70bac4f87438ded36ad4793793c0095de6572d433d98575a5752629ef549/multidict-6.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1c416351ee6271b2f49b56ad7f308072f6f44b37118d69c2cad94f3fa8a40d5", size = 126639 }, + { url = "https://files.pythonhosted.org/packages/80/cf/17f35b3b9509b4959303c05379c4bfb0d7dd05c3306039fc79cf035bbac0/multidict-6.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:6b5d83030255983181005e6cfbac1617ce9746b219bc2aad52201ad121226581", size = 134315 }, + { url = "https://files.pythonhosted.org/packages/ef/1f/652d70ab5effb33c031510a3503d4d6efc5ec93153562f1ee0acdc895a57/multidict-6.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e97b5e938051226dc025ec80980c285b053ffb1e25a3db2a3aa3bc046bf7f56", size = 129471 }, + { url = "https://files.pythonhosted.org/packages/a6/64/2dd6c4c681688c0165dea3975a6a4eab4944ea30f35000f8b8af1df3148c/multidict-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d618649d4e70ac6efcbba75be98b26ef5078faad23592f9b51ca492953012429", size = 124585 }, + { url = "https://files.pythonhosted.org/packages/87/56/e6ee5459894c7e554b57ba88f7257dc3c3d2d379cb15baaa1e265b8c6165/multidict-6.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10524ebd769727ac77ef2278390fb0068d83f3acb7773792a5080f2b0abf7748", size = 116957 }, + { url = "https://files.pythonhosted.org/packages/36/9e/616ce5e8d375c24b84f14fc263c7ef1d8d5e8ef529dbc0f1df8ce71bb5b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ff3827aef427c89a25cc96ded1759271a93603aba9fb977a6d264648ebf989db", size = 128609 }, + { url = "https://files.pythonhosted.org/packages/8c/4f/4783e48a38495d000f2124020dc96bacc806a4340345211b1ab6175a6cb4/multidict-6.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06809f4f0f7ab7ea2cabf9caca7d79c22c0758b58a71f9d32943ae13c7ace056", size = 123016 }, + { url = "https://files.pythonhosted.org/packages/3e/b3/4950551ab8fc39862ba5e9907dc821f896aa829b4524b4deefd3e12945ab/multidict-6.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:f179dee3b863ab1c59580ff60f9d99f632f34ccb38bf67a33ec6b3ecadd0fd76", size = 133542 }, + { url = "https://files.pythonhosted.org/packages/96/4d/f0ce6ac9914168a2a71df117935bb1f1781916acdecbb43285e225b484b8/multidict-6.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:aaed8b0562be4a0876ee3b6946f6869b7bcdb571a5d1496683505944e268b160", size = 130163 }, + { url = "https://files.pythonhosted.org/packages/be/72/17c9f67e7542a49dd252c5ae50248607dfb780bcc03035907dafefb067e3/multidict-6.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c8b88a2ccf5493b6c8da9076fb151ba106960a2df90c2633f342f120751a9e7", size = 126832 }, + { url = "https://files.pythonhosted.org/packages/71/9f/72d719e248cbd755c8736c6d14780533a1606ffb3fbb0fbd77da9f0372da/multidict-6.1.0-cp310-cp310-win32.whl", hash = "sha256:4a9cb68166a34117d6646c0023c7b759bf197bee5ad4272f420a0141d7eb03a0", size = 26402 }, + { url = "https://files.pythonhosted.org/packages/04/5a/d88cd5d00a184e1ddffc82aa2e6e915164a6d2641ed3606e766b5d2f275a/multidict-6.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:20b9b5fbe0b88d0bdef2012ef7dee867f874b72528cf1d08f1d59b0e3850129d", size = 28800 }, + { url = "https://files.pythonhosted.org/packages/93/13/df3505a46d0cd08428e4c8169a196131d1b0c4b515c3649829258843dde6/multidict-6.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3efe2c2cb5763f2f1b275ad2bf7a287d3f7ebbef35648a9726e3b69284a4f3d6", size = 48570 }, + { url = "https://files.pythonhosted.org/packages/f0/e1/a215908bfae1343cdb72f805366592bdd60487b4232d039c437fe8f5013d/multidict-6.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7053d3b0353a8b9de430a4f4b4268ac9a4fb3481af37dfe49825bf45ca24156", size = 29316 }, + { url = 
"https://files.pythonhosted.org/packages/70/0f/6dc70ddf5d442702ed74f298d69977f904960b82368532c88e854b79f72b/multidict-6.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27e5fc84ccef8dfaabb09d82b7d179c7cf1a3fbc8a966f8274fcb4ab2eb4cadb", size = 29640 }, + { url = "https://files.pythonhosted.org/packages/d8/6d/9c87b73a13d1cdea30b321ef4b3824449866bd7f7127eceed066ccb9b9ff/multidict-6.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2b90b43e696f25c62656389d32236e049568b39320e2735d51f08fd362761b", size = 131067 }, + { url = "https://files.pythonhosted.org/packages/cc/1e/1b34154fef373371fd6c65125b3d42ff5f56c7ccc6bfff91b9b3c60ae9e0/multidict-6.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d83a047959d38a7ff552ff94be767b7fd79b831ad1cd9920662db05fec24fe72", size = 138507 }, + { url = "https://files.pythonhosted.org/packages/fb/e0/0bc6b2bac6e461822b5f575eae85da6aae76d0e2a79b6665d6206b8e2e48/multidict-6.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1a9dd711d0877a1ece3d2e4fea11a8e75741ca21954c919406b44e7cf971304", size = 133905 }, + { url = "https://files.pythonhosted.org/packages/ba/af/73d13b918071ff9b2205fcf773d316e0f8fefb4ec65354bbcf0b10908cc6/multidict-6.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec2abea24d98246b94913b76a125e855eb5c434f7c46546046372fe60f666351", size = 129004 }, + { url = "https://files.pythonhosted.org/packages/74/21/23960627b00ed39643302d81bcda44c9444ebcdc04ee5bedd0757513f259/multidict-6.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4867cafcbc6585e4b678876c489b9273b13e9fff9f6d6d66add5e15d11d926cb", size = 121308 }, + { url = "https://files.pythonhosted.org/packages/8b/5c/cf282263ffce4a596ed0bb2aa1a1dddfe1996d6a62d08842a8d4b33dca13/multidict-6.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5b48204e8d955c47c55b72779802b219a39acc3ee3d0116d5080c388970b76e3", size = 132608 }, + { url = "https://files.pythonhosted.org/packages/d7/3e/97e778c041c72063f42b290888daff008d3ab1427f5b09b714f5a8eff294/multidict-6.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8fff389528cad1618fb4b26b95550327495462cd745d879a8c7c2115248e399", size = 127029 }, + { url = "https://files.pythonhosted.org/packages/47/ac/3efb7bfe2f3aefcf8d103e9a7162572f01936155ab2f7ebcc7c255a23212/multidict-6.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a7a9541cd308eed5e30318430a9c74d2132e9a8cb46b901326272d780bf2d423", size = 137594 }, + { url = "https://files.pythonhosted.org/packages/42/9b/6c6e9e8dc4f915fc90a9b7798c44a30773dea2995fdcb619870e705afe2b/multidict-6.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:da1758c76f50c39a2efd5e9859ce7d776317eb1dd34317c8152ac9251fc574a3", size = 134556 }, + { url = "https://files.pythonhosted.org/packages/1d/10/8e881743b26aaf718379a14ac58572a240e8293a1c9d68e1418fb11c0f90/multidict-6.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c943a53e9186688b45b323602298ab727d8865d8c9ee0b17f8d62d14b56f0753", size = 130993 }, + { url = "https://files.pythonhosted.org/packages/45/84/3eb91b4b557442802d058a7579e864b329968c8d0ea57d907e7023c677f2/multidict-6.1.0-cp311-cp311-win32.whl", hash = "sha256:90f8717cb649eea3504091e640a1b8568faad18bd4b9fcd692853a04475a4b80", size = 26405 }, + { url = "https://files.pythonhosted.org/packages/9f/0b/ad879847ecbf6d27e90a6eabb7eff6b62c129eefe617ea45eae7c1f0aead/multidict-6.1.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:82176036e65644a6cc5bd619f65f6f19781e8ec2e5330f51aa9ada7504cc1926", size = 28795 }, + { url = "https://files.pythonhosted.org/packages/fd/16/92057c74ba3b96d5e211b553895cd6dc7cc4d1e43d9ab8fafc727681ef71/multidict-6.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b04772ed465fa3cc947db808fa306d79b43e896beb677a56fb2347ca1a49c1fa", size = 48713 }, + { url = "https://files.pythonhosted.org/packages/94/3d/37d1b8893ae79716179540b89fc6a0ee56b4a65fcc0d63535c6f5d96f217/multidict-6.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6180c0ae073bddeb5a97a38c03f30c233e0a4d39cd86166251617d1bbd0af436", size = 29516 }, + { url = "https://files.pythonhosted.org/packages/a2/12/adb6b3200c363062f805275b4c1e656be2b3681aada66c80129932ff0bae/multidict-6.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:071120490b47aa997cca00666923a83f02c7fbb44f71cf7f136df753f7fa8761", size = 29557 }, + { url = "https://files.pythonhosted.org/packages/47/e9/604bb05e6e5bce1e6a5cf80a474e0f072e80d8ac105f1b994a53e0b28c42/multidict-6.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50b3a2710631848991d0bf7de077502e8994c804bb805aeb2925a981de58ec2e", size = 130170 }, + { url = "https://files.pythonhosted.org/packages/7e/13/9efa50801785eccbf7086b3c83b71a4fb501a4d43549c2f2f80b8787d69f/multidict-6.1.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58c621844d55e71c1b7f7c498ce5aa6985d743a1a59034c57a905b3f153c1ef", size = 134836 }, + { url = "https://files.pythonhosted.org/packages/bf/0f/93808b765192780d117814a6dfcc2e75de6dcc610009ad408b8814dca3ba/multidict-6.1.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55b6d90641869892caa9ca42ff913f7ff1c5ece06474fbd32fb2cf6834726c95", size = 133475 }, + { url = "https://files.pythonhosted.org/packages/d3/c8/529101d7176fe7dfe1d99604e48d69c5dfdcadb4f06561f465c8ef12b4df/multidict-6.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b820514bfc0b98a30e3d85462084779900347e4d49267f747ff54060cc33925", size = 131049 }, + { url = "https://files.pythonhosted.org/packages/ca/0c/fc85b439014d5a58063e19c3a158a889deec399d47b5269a0f3b6a2e28bc/multidict-6.1.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10a9b09aba0c5b48c53761b7c720aaaf7cf236d5fe394cd399c7ba662d5f9966", size = 120370 }, + { url = "https://files.pythonhosted.org/packages/db/46/d4416eb20176492d2258fbd47b4abe729ff3b6e9c829ea4236f93c865089/multidict-6.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1e16bf3e5fc9f44632affb159d30a437bfe286ce9e02754759be5536b169b305", size = 125178 }, + { url = "https://files.pythonhosted.org/packages/5b/46/73697ad7ec521df7de5531a32780bbfd908ded0643cbe457f981a701457c/multidict-6.1.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76f364861c3bfc98cbbcbd402d83454ed9e01a5224bb3a28bf70002a230f73e2", size = 119567 }, + { url = "https://files.pythonhosted.org/packages/cd/ed/51f060e2cb0e7635329fa6ff930aa5cffa17f4c7f5c6c3ddc3500708e2f2/multidict-6.1.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:820c661588bd01a0aa62a1283f20d2be4281b086f80dad9e955e690c75fb54a2", size = 129822 }, + { url = "https://files.pythonhosted.org/packages/df/9e/ee7d1954b1331da3eddea0c4e08d9142da5f14b1321c7301f5014f49d492/multidict-6.1.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:0e5f362e895bc5b9e67fe6e4ded2492d8124bdf817827f33c5b46c2fe3ffaca6", size = 128656 }, + { url = 
"https://files.pythonhosted.org/packages/77/00/8538f11e3356b5d95fa4b024aa566cde7a38aa7a5f08f4912b32a037c5dc/multidict-6.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ec660d19bbc671e3a6443325f07263be452c453ac9e512f5eb935e7d4ac28b3", size = 125360 }, + { url = "https://files.pythonhosted.org/packages/be/05/5d334c1f2462d43fec2363cd00b1c44c93a78c3925d952e9a71caf662e96/multidict-6.1.0-cp312-cp312-win32.whl", hash = "sha256:58130ecf8f7b8112cdb841486404f1282b9c86ccb30d3519faf301b2e5659133", size = 26382 }, + { url = "https://files.pythonhosted.org/packages/a3/bf/f332a13486b1ed0496d624bcc7e8357bb8053823e8cd4b9a18edc1d97e73/multidict-6.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:188215fc0aafb8e03341995e7c4797860181562380f81ed0a87ff455b70bf1f1", size = 28529 }, + { url = "https://files.pythonhosted.org/packages/22/67/1c7c0f39fe069aa4e5d794f323be24bf4d33d62d2a348acdb7991f8f30db/multidict-6.1.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d569388c381b24671589335a3be6e1d45546c2988c2ebe30fdcada8457a31008", size = 48771 }, + { url = "https://files.pythonhosted.org/packages/3c/25/c186ee7b212bdf0df2519eacfb1981a017bda34392c67542c274651daf23/multidict-6.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:052e10d2d37810b99cc170b785945421141bf7bb7d2f8799d431e7db229c385f", size = 29533 }, + { url = "https://files.pythonhosted.org/packages/67/5e/04575fd837e0958e324ca035b339cea174554f6f641d3fb2b4f2e7ff44a2/multidict-6.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f90c822a402cb865e396a504f9fc8173ef34212a342d92e362ca498cad308e28", size = 29595 }, + { url = "https://files.pythonhosted.org/packages/d3/b2/e56388f86663810c07cfe4a3c3d87227f3811eeb2d08450b9e5d19d78876/multidict-6.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b225d95519a5bf73860323e633a664b0d85ad3d5bede6d30d95b35d4dfe8805b", size = 130094 }, + { url = "https://files.pythonhosted.org/packages/6c/ee/30ae9b4186a644d284543d55d491fbd4239b015d36b23fea43b4c94f7052/multidict-6.1.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:23bfd518810af7de1116313ebd9092cb9aa629beb12f6ed631ad53356ed6b86c", size = 134876 }, + { url = "https://files.pythonhosted.org/packages/84/c7/70461c13ba8ce3c779503c70ec9d0345ae84de04521c1f45a04d5f48943d/multidict-6.1.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c09fcfdccdd0b57867577b719c69e347a436b86cd83747f179dbf0cc0d4c1f3", size = 133500 }, + { url = "https://files.pythonhosted.org/packages/4a/9f/002af221253f10f99959561123fae676148dd730e2daa2cd053846a58507/multidict-6.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf6bea52ec97e95560af5ae576bdac3aa3aae0b6758c6efa115236d9e07dae44", size = 131099 }, + { url = "https://files.pythonhosted.org/packages/82/42/d1c7a7301d52af79d88548a97e297f9d99c961ad76bbe6f67442bb77f097/multidict-6.1.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57feec87371dbb3520da6192213c7d6fc892d5589a93db548331954de8248fd2", size = 120403 }, + { url = "https://files.pythonhosted.org/packages/68/f3/471985c2c7ac707547553e8f37cff5158030d36bdec4414cb825fbaa5327/multidict-6.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0c3f390dc53279cbc8ba976e5f8035eab997829066756d811616b652b00a23a3", size = 125348 }, + { url = "https://files.pythonhosted.org/packages/67/2c/e6df05c77e0e433c214ec1d21ddd203d9a4770a1f2866a8ca40a545869a0/multidict-6.1.0-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:59bfeae4b25ec05b34f1956eaa1cb38032282cd4dfabc5056d0a1ec4d696d3aa", size = 119673 }, + { url = "https://files.pythonhosted.org/packages/c5/cd/bc8608fff06239c9fb333f9db7743a1b2eafe98c2666c9a196e867a3a0a4/multidict-6.1.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b2f59caeaf7632cc633b5cf6fc449372b83bbdf0da4ae04d5be36118e46cc0aa", size = 129927 }, + { url = "https://files.pythonhosted.org/packages/44/8e/281b69b7bc84fc963a44dc6e0bbcc7150e517b91df368a27834299a526ac/multidict-6.1.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:37bb93b2178e02b7b618893990941900fd25b6b9ac0fa49931a40aecdf083fe4", size = 128711 }, + { url = "https://files.pythonhosted.org/packages/12/a4/63e7cd38ed29dd9f1881d5119f272c898ca92536cdb53ffe0843197f6c85/multidict-6.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4e9f48f58c2c523d5a06faea47866cd35b32655c46b443f163d08c6d0ddb17d6", size = 125519 }, + { url = "https://files.pythonhosted.org/packages/38/e0/4f5855037a72cd8a7a2f60a3952d9aa45feedb37ae7831642102604e8a37/multidict-6.1.0-cp313-cp313-win32.whl", hash = "sha256:3a37ffb35399029b45c6cc33640a92bef403c9fd388acce75cdc88f58bd19a81", size = 26426 }, + { url = "https://files.pythonhosted.org/packages/7e/a5/17ee3a4db1e310b7405f5d25834460073a8ccd86198ce044dfaf69eac073/multidict-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:e9aa71e15d9d9beaad2c6b9319edcdc0a49a43ef5c0a4c8265ca9ee7d6c67774", size = 28531 }, + { url = "https://files.pythonhosted.org/packages/99/b7/b9e70fde2c0f0c9af4cc5277782a89b66d35948ea3369ec9f598358c3ac5/multidict-6.1.0-py3-none-any.whl", hash = "sha256:48e171e52d1c4d33888e529b999e5900356b9ae588c2f09a52dcefb158b27506", size = 10051 }, +] + +[[package]] +name = "multiprocess" +version = "0.70.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "dill" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980 }, + { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", size = 134982 }, + { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824 }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519 }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741 }, + { url = 
"https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628 }, + { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351 }, +] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -1376,19 +1789,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/17/7f/d322a4125405920401450118dbdc52e0384026bd669939484670ce8b2ab9/numpy-2.2.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:783145835458e60fa97afac25d511d00a1eca94d4a8f3ace9fe2043003c678e4", size = 12839607 }, ] -[[package]] -name = "ollama" -version = "0.4.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "httpx" }, - { name = "pydantic" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b0/6d/dc77539c735bbed5d0c873fb029fb86aa9f0163df169b34152914331c369/ollama-0.4.7.tar.gz", hash = "sha256:891dcbe54f55397d82d289c459de0ea897e103b86a3f1fad0fdb1895922a75ff", size = 12843 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/31/83/c3ffac86906c10184c88c2e916460806b072a2cfe34cdcaf3a0c0e836d39/ollama-0.4.7-py3-none-any.whl", hash = "sha256:85505663cca67a83707be5fb3aeff0ea72e67846cea5985529d8eca4366564a1", size = 13210 }, -] - [[package]] name = "openai" version = "1.63.2" @@ -1690,6 +2090,95 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816 }, ] +[[package]] +name = "propcache" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/92/76/f941e63d55c0293ff7829dd21e7cf1147e90a526756869a9070f287a68c9/propcache-0.3.0.tar.gz", hash = "sha256:a8fd93de4e1d278046345f49e2238cdb298589325849b2645d4a94c53faeffc5", size = 42722 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/f0/dc9ec44d2e63c13f816a16398c039329736712440ff82b682dd9a78d2258/propcache-0.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:efa44f64c37cc30c9f05932c740a8b40ce359f51882c70883cc95feac842da4d", size = 79574 }, + { url = "https://files.pythonhosted.org/packages/99/3a/33a207dfcb3ee1131ea23a2aeb726c3c4994f89546d7eadf8c50627c8b63/propcache-0.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2383a17385d9800b6eb5855c2f05ee550f803878f344f58b6e194de08b96352c", size = 45898 }, + { url = "https://files.pythonhosted.org/packages/af/68/0bde765c9f5dc02b4466d2838600af38c81b184c26c6d3cd44643ac668e3/propcache-0.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3e7420211f5a65a54675fd860ea04173cde60a7cc20ccfbafcccd155225f8bc", size = 45418 }, + { url = "https://files.pythonhosted.org/packages/06/a6/c682669bae41199358e16cc7b1c818f91c5f9e925cc863dabd98ce32716a/propcache-0.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3302c5287e504d23bb0e64d2a921d1eb4a03fb93a0a0aa3b53de059f5a5d737d", size = 205116 }, + { url = "https://files.pythonhosted.org/packages/fb/ae/82cfb50267d9a1baa0340728eb9e32245a68538fef929d7bb786d01c11a8/propcache-0.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:7e2e068a83552ddf7a39a99488bcba05ac13454fb205c847674da0352602082f", size = 219405 }, + { url = "https://files.pythonhosted.org/packages/ab/16/7b6b2bf8c207cfd0e5ca3d41aea397392de9899867ec024f88c94f9ae2ab/propcache-0.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d913d36bdaf368637b4f88d554fb9cb9d53d6920b9c5563846555938d5450bf", size = 217656 }, + { url = "https://files.pythonhosted.org/packages/f4/eb/41447de61eb5454891658d0fb9b1d7d35d49a4a5dd2e0c86f2c332e8b7e1/propcache-0.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ee1983728964d6070ab443399c476de93d5d741f71e8f6e7880a065f878e0b9", size = 205414 }, + { url = "https://files.pythonhosted.org/packages/03/b6/9719878f8b5b20d37ee663a40f8dcbf888559e4d3be2ba2fe5c790fc28d2/propcache-0.3.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36ca5e9a21822cc1746023e88f5c0af6fce3af3b85d4520efb1ce4221bed75cc", size = 195746 }, + { url = "https://files.pythonhosted.org/packages/bb/ec/b79c3210ba459800d1a8f1afeb81d7b503893555a7b79c24082ff26d3314/propcache-0.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9ecde3671e62eeb99e977f5221abcf40c208f69b5eb986b061ccec317c82ebd0", size = 198651 }, + { url = "https://files.pythonhosted.org/packages/48/f6/2b0140bc47013e43575973068e72ad51ee9f22f2dad42e6d6e362d715125/propcache-0.3.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:d383bf5e045d7f9d239b38e6acadd7b7fdf6c0087259a84ae3475d18e9a2ae8b", size = 195858 }, + { url = "https://files.pythonhosted.org/packages/97/3d/2fa19303d87aa21f9a42dcd870d6088a2a776ff5518e394d50412c3679a6/propcache-0.3.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:8cb625bcb5add899cb8ba7bf716ec1d3e8f7cdea9b0713fa99eadf73b6d4986f", size = 197181 }, + { url = "https://files.pythonhosted.org/packages/09/f3/a2170ffc9fa774c1dfd52294113c0fa6cdc5b71dbfd7129bb9378fdd8b42/propcache-0.3.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5fa159dcee5dba00c1def3231c249cf261185189205073bde13797e57dd7540a", size = 207411 }, + { url = "https://files.pythonhosted.org/packages/d6/1e/cb8a6c82178efffa0b00dc463f36cd086f747345585140aeb95d5cb93666/propcache-0.3.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:a7080b0159ce05f179cfac592cda1a82898ca9cd097dacf8ea20ae33474fbb25", size = 210724 }, + { url = "https://files.pythonhosted.org/packages/2b/72/6e273543337a3e22cf462eb836f065a9830b4d41baeb1f58db2695c934f3/propcache-0.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ed7161bccab7696a473fe7ddb619c1d75963732b37da4618ba12e60899fefe4f", size = 203511 }, + { url = "https://files.pythonhosted.org/packages/f3/ea/7412c79bcec06597c967d49789f5a1f7fd76a8654908feeaefafb7447c9a/propcache-0.3.0-cp310-cp310-win32.whl", hash = "sha256:bf0d9a171908f32d54f651648c7290397b8792f4303821c42a74e7805bfb813c", size = 40600 }, + { url = "https://files.pythonhosted.org/packages/a3/42/488c90190491f3e61bd2c2fb0b3d91c1c78778270dde2f0b6633fc9ff723/propcache-0.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:42924dc0c9d73e49908e35bbdec87adedd651ea24c53c29cac103ede0ea1d340", size = 44714 }, + { url = "https://files.pythonhosted.org/packages/45/c9/cf09ff7e6d09f14149094f7cd50d2dec032b24e61af21fc4540da2b17bfb/propcache-0.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9ddd49258610499aab83b4f5b61b32e11fce873586282a0e972e5ab3bcadee51", size = 79568 }, + { url = 
"https://files.pythonhosted.org/packages/c8/32/2424d89da88cd81b7d148e0d2b3131461b570a02aa9d84a2e567509adb0d/propcache-0.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2578541776769b500bada3f8a4eeaf944530516b6e90c089aa368266ed70c49e", size = 45895 }, + { url = "https://files.pythonhosted.org/packages/f6/91/ee5b6aa7aa31754fefcf0c5180e09223cac380ef195c4ddc8c266eb641ea/propcache-0.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8074c5dd61c8a3e915fa8fc04754fa55cfa5978200d2daa1e2d4294c1f136aa", size = 45427 }, + { url = "https://files.pythonhosted.org/packages/bf/73/38f0128462b8b616181d8c53bd5d04eac41c50c449b07615c65d56ba0a9b/propcache-0.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b58229a844931bca61b3a20efd2be2a2acb4ad1622fc026504309a6883686fbf", size = 232427 }, + { url = "https://files.pythonhosted.org/packages/59/82/f3d4e84f4539dcfc9c3d338282b9e915f5b63c921986ecfdf7af2d12f87c/propcache-0.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e45377d5d6fefe1677da2a2c07b024a6dac782088e37c0b1efea4cfe2b1be19b", size = 239985 }, + { url = "https://files.pythonhosted.org/packages/42/e8/029f58cccbae83c9969a7ee7a06558d5b83a93dfc54e0f4f70234bbaea1b/propcache-0.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ec5060592d83454e8063e487696ac3783cc48c9a329498bafae0d972bc7816c9", size = 238827 }, + { url = "https://files.pythonhosted.org/packages/8b/a2/c373561777c0cb9b9e7b9b9a10b9b3a7b6bde75a2535b962231cecc8fdb8/propcache-0.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15010f29fbed80e711db272909a074dc79858c6d28e2915704cfc487a8ac89c6", size = 231348 }, + { url = "https://files.pythonhosted.org/packages/d7/d2/4673f715beedf6038b485bcd976813149231d9df5bb6196cb69a09c185c9/propcache-0.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a254537b9b696ede293bfdbc0a65200e8e4507bc9f37831e2a0318a9b333c85c", size = 220426 }, + { url = "https://files.pythonhosted.org/packages/e0/f6/1da65f900927bafd4675a16e890618ec7643f2f922bf0e4d84bb38645618/propcache-0.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2b975528998de037dfbc10144b8aed9b8dd5a99ec547f14d1cb7c5665a43f075", size = 220294 }, + { url = "https://files.pythonhosted.org/packages/ff/86/620451bdc02e91b1712cd71890c17077ee97e2a28493836a87e47b8e70ff/propcache-0.3.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:19d36bb351ad5554ff20f2ae75f88ce205b0748c38b146c75628577020351e3c", size = 212492 }, + { url = "https://files.pythonhosted.org/packages/6e/1b/e8f86921ed4016da80faf3b8f515f7829decabdbff106736bfff353bceba/propcache-0.3.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6032231d4a5abd67c7f71168fd64a47b6b451fbcb91c8397c2f7610e67683810", size = 215113 }, + { url = "https://files.pythonhosted.org/packages/1a/95/a61d86cc49aa0945f6c06f3a4614fc543e311a50558c92861f5e9691a37c/propcache-0.3.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6985a593417cdbc94c7f9c3403747335e450c1599da1647a5af76539672464d3", size = 228330 }, + { url = "https://files.pythonhosted.org/packages/8f/7d/10dbae48ff2bb189e92c2b3487a48f3229146a25941ad0d485934d1104d4/propcache-0.3.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:6a1948df1bb1d56b5e7b0553c0fa04fd0e320997ae99689488201f19fa90d2e7", size = 231942 }, + { url = 
"https://files.pythonhosted.org/packages/39/ce/82d16aec96c5513ae7db13ab901a65a1e54c915292fb5b2390e33275b61d/propcache-0.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8319293e85feadbbfe2150a5659dbc2ebc4afdeaf7d98936fb9a2f2ba0d4c35c", size = 223077 }, + { url = "https://files.pythonhosted.org/packages/c8/e0/cb077e8e7a583c733df7f53327fcbdb92e42be59b976ce60bf1d904a0efe/propcache-0.3.0-cp311-cp311-win32.whl", hash = "sha256:63f26258a163c34542c24808f03d734b338da66ba91f410a703e505c8485791d", size = 40455 }, + { url = "https://files.pythonhosted.org/packages/d8/35/57abeb6146fe3c19081eeaf3d9d4cfea256f87f1e5101acf80d3332c1820/propcache-0.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:cacea77ef7a2195f04f9279297684955e3d1ae4241092ff0cfcef532bb7a1c32", size = 44705 }, + { url = "https://files.pythonhosted.org/packages/8d/2c/921f15dc365796ec23975b322b0078eae72995c7b4d49eba554c6a308d70/propcache-0.3.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e53d19c2bf7d0d1e6998a7e693c7e87300dd971808e6618964621ccd0e01fe4e", size = 79867 }, + { url = "https://files.pythonhosted.org/packages/11/a5/4a6cc1a559d1f2fb57ea22edc4245158cdffae92f7f92afcee2913f84417/propcache-0.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a61a68d630e812b67b5bf097ab84e2cd79b48c792857dc10ba8a223f5b06a2af", size = 46109 }, + { url = "https://files.pythonhosted.org/packages/e1/6d/28bfd3af3a567ad7d667348e7f46a520bda958229c4d545ba138a044232f/propcache-0.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fb91d20fa2d3b13deea98a690534697742029f4fb83673a3501ae6e3746508b5", size = 45635 }, + { url = "https://files.pythonhosted.org/packages/73/20/d75b42eaffe5075eac2f4e168f6393d21c664c91225288811d85451b2578/propcache-0.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67054e47c01b7b349b94ed0840ccae075449503cf1fdd0a1fdd98ab5ddc2667b", size = 242159 }, + { url = "https://files.pythonhosted.org/packages/a5/fb/4b537dd92f9fd4be68042ec51c9d23885ca5fafe51ec24c58d9401034e5f/propcache-0.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:997e7b8f173a391987df40f3b52c423e5850be6f6df0dcfb5376365440b56667", size = 248163 }, + { url = "https://files.pythonhosted.org/packages/e7/af/8a9db04ac596d531ca0ef7dde518feaadfcdabef7b17d6a5ec59ee3effc2/propcache-0.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d663fd71491dde7dfdfc899d13a067a94198e90695b4321084c6e450743b8c7", size = 248794 }, + { url = "https://files.pythonhosted.org/packages/9d/c4/ecfc988879c0fd9db03228725b662d76cf484b6b46f7e92fee94e4b52490/propcache-0.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8884ba1a0fe7210b775106b25850f5e5a9dc3c840d1ae9924ee6ea2eb3acbfe7", size = 243912 }, + { url = "https://files.pythonhosted.org/packages/04/a2/298dd27184faa8b7d91cc43488b578db218b3cc85b54d912ed27b8c5597a/propcache-0.3.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa806bbc13eac1ab6291ed21ecd2dd426063ca5417dd507e6be58de20e58dfcf", size = 229402 }, + { url = "https://files.pythonhosted.org/packages/be/0d/efe7fec316ca92dbf4bc4a9ba49ca889c43ca6d48ab1d6fa99fc94e5bb98/propcache-0.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6f4d7a7c0aff92e8354cceca6fe223973ddf08401047920df0fcb24be2bd5138", size = 226896 }, + { url = "https://files.pythonhosted.org/packages/60/63/72404380ae1d9c96d96e165aa02c66c2aae6072d067fc4713da5cde96762/propcache-0.3.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = 
"sha256:9be90eebc9842a93ef8335291f57b3b7488ac24f70df96a6034a13cb58e6ff86", size = 221447 }, + { url = "https://files.pythonhosted.org/packages/9d/18/b8392cab6e0964b67a30a8f4dadeaff64dc7022b5a34bb1d004ea99646f4/propcache-0.3.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:bf15fc0b45914d9d1b706f7c9c4f66f2b7b053e9517e40123e137e8ca8958b3d", size = 222440 }, + { url = "https://files.pythonhosted.org/packages/6f/be/105d9ceda0f97eff8c06bac1673448b2db2a497444de3646464d3f5dc881/propcache-0.3.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5a16167118677d94bb48bfcd91e420088854eb0737b76ec374b91498fb77a70e", size = 234104 }, + { url = "https://files.pythonhosted.org/packages/cb/c9/f09a4ec394cfcce4053d8b2a04d622b5f22d21ba9bb70edd0cad061fa77b/propcache-0.3.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:41de3da5458edd5678b0f6ff66691507f9885f5fe6a0fb99a5d10d10c0fd2d64", size = 239086 }, + { url = "https://files.pythonhosted.org/packages/ea/aa/96f7f9ed6def82db67c972bdb7bd9f28b95d7d98f7e2abaf144c284bf609/propcache-0.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:728af36011bb5d344c4fe4af79cfe186729efb649d2f8b395d1572fb088a996c", size = 230991 }, + { url = "https://files.pythonhosted.org/packages/5a/11/bee5439de1307d06fad176f7143fec906e499c33d7aff863ea8428b8e98b/propcache-0.3.0-cp312-cp312-win32.whl", hash = "sha256:6b5b7fd6ee7b54e01759f2044f936dcf7dea6e7585f35490f7ca0420fe723c0d", size = 40337 }, + { url = "https://files.pythonhosted.org/packages/e4/17/e5789a54a0455a61cb9efc4ca6071829d992220c2998a27c59aeba749f6f/propcache-0.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:2d15bc27163cd4df433e75f546b9ac31c1ba7b0b128bfb1b90df19082466ff57", size = 44404 }, + { url = "https://files.pythonhosted.org/packages/3a/0f/a79dd23a0efd6ee01ab0dc9750d8479b343bfd0c73560d59d271eb6a99d4/propcache-0.3.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a2b9bf8c79b660d0ca1ad95e587818c30ccdb11f787657458d6f26a1ea18c568", size = 77287 }, + { url = "https://files.pythonhosted.org/packages/b8/51/76675703c90de38ac75adb8deceb3f3ad99b67ff02a0fa5d067757971ab8/propcache-0.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b0c1a133d42c6fc1f5fbcf5c91331657a1ff822e87989bf4a6e2e39b818d0ee9", size = 44923 }, + { url = "https://files.pythonhosted.org/packages/01/9b/fd5ddbee66cf7686e73c516227c2fd9bf471dbfed0f48329d095ea1228d3/propcache-0.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bb2f144c6d98bb5cbc94adeb0447cfd4c0f991341baa68eee3f3b0c9c0e83767", size = 44325 }, + { url = "https://files.pythonhosted.org/packages/13/1c/6961f11eb215a683b34b903b82bde486c606516c1466bf1fa67f26906d51/propcache-0.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1323cd04d6e92150bcc79d0174ce347ed4b349d748b9358fd2e497b121e03c8", size = 225116 }, + { url = "https://files.pythonhosted.org/packages/ef/ea/f8410c40abcb2e40dffe9adeed017898c930974650a63e5c79b886aa9f73/propcache-0.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b812b3cb6caacd072276ac0492d249f210006c57726b6484a1e1805b3cfeea0", size = 229905 }, + { url = "https://files.pythonhosted.org/packages/ef/5a/a9bf90894001468bf8e6ea293bb00626cc9ef10f8eb7996e9ec29345c7ed/propcache-0.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:742840d1d0438eb7ea4280f3347598f507a199a35a08294afdcc560c3739989d", size = 233221 }, + { url = 
"https://files.pythonhosted.org/packages/dd/ce/fffdddd9725b690b01d345c1156b4c2cc6dca09ab5c23a6d07b8f37d6e2f/propcache-0.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c6e7e4f9167fddc438cd653d826f2222222564daed4116a02a184b464d3ef05", size = 227627 }, + { url = "https://files.pythonhosted.org/packages/58/ae/45c89a5994a334735a3032b48e8e4a98c05d9536ddee0719913dc27da548/propcache-0.3.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a94ffc66738da99232ddffcf7910e0f69e2bbe3a0802e54426dbf0714e1c2ffe", size = 214217 }, + { url = "https://files.pythonhosted.org/packages/01/84/bc60188c3290ff8f5f4a92b9ca2d93a62e449c8daf6fd11ad517ad136926/propcache-0.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3c6ec957025bf32b15cbc6b67afe233c65b30005e4c55fe5768e4bb518d712f1", size = 212921 }, + { url = "https://files.pythonhosted.org/packages/14/b3/39d60224048feef7a96edabb8217dc3f75415457e5ebbef6814f8b2a27b5/propcache-0.3.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:549722908de62aa0b47a78b90531c022fa6e139f9166be634f667ff45632cc92", size = 208200 }, + { url = "https://files.pythonhosted.org/packages/9d/b3/0a6720b86791251273fff8a01bc8e628bc70903513bd456f86cde1e1ef84/propcache-0.3.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5d62c4f6706bff5d8a52fd51fec6069bef69e7202ed481486c0bc3874912c787", size = 208400 }, + { url = "https://files.pythonhosted.org/packages/e9/4f/bb470f3e687790547e2e78105fb411f54e0cdde0d74106ccadd2521c6572/propcache-0.3.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:24c04f8fbf60094c531667b8207acbae54146661657a1b1be6d3ca7773b7a545", size = 218116 }, + { url = "https://files.pythonhosted.org/packages/34/71/277f7f9add469698ac9724c199bfe06f85b199542121a71f65a80423d62a/propcache-0.3.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7c5f5290799a3f6539cc5e6f474c3e5c5fbeba74a5e1e5be75587746a940d51e", size = 222911 }, + { url = "https://files.pythonhosted.org/packages/92/e3/a7b9782aef5a2fc765b1d97da9ec7aed2f25a4e985703608e73232205e3f/propcache-0.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4fa0e7c9c3cf7c276d4f6ab9af8adddc127d04e0fcabede315904d2ff76db626", size = 216563 }, + { url = "https://files.pythonhosted.org/packages/ab/76/0583ca2c551aa08ffcff87b2c6849c8f01c1f6fb815a5226f0c5c202173e/propcache-0.3.0-cp313-cp313-win32.whl", hash = "sha256:ee0bd3a7b2e184e88d25c9baa6a9dc609ba25b76daae942edfb14499ac7ec374", size = 39763 }, + { url = "https://files.pythonhosted.org/packages/80/ec/c6a84f9a36f608379b95f0e786c111d5465926f8c62f12be8cdadb02b15c/propcache-0.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:1c8f7d896a16da9455f882870a507567d4f58c53504dc2d4b1e1d386dfe4588a", size = 43650 }, + { url = "https://files.pythonhosted.org/packages/ee/95/7d32e3560f5bf83fc2f2a4c1b0c181d327d53d5f85ebd045ab89d4d97763/propcache-0.3.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e560fd75aaf3e5693b91bcaddd8b314f4d57e99aef8a6c6dc692f935cc1e6bbf", size = 82140 }, + { url = "https://files.pythonhosted.org/packages/86/89/752388f12e6027a5e63f5d075f15291ded48e2d8311314fff039da5a9b11/propcache-0.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:65a37714b8ad9aba5780325228598a5b16c47ba0f8aeb3dc0514701e4413d7c0", size = 47296 }, + { url = "https://files.pythonhosted.org/packages/1b/4c/b55c98d586c69180d3048984a57a5ea238bdeeccf82dbfcd598e935e10bb/propcache-0.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:07700939b2cbd67bfb3b76a12e1412405d71019df00ca5697ce75e5ef789d829", 
size = 46724 }, + { url = "https://files.pythonhosted.org/packages/0f/b6/67451a437aed90c4e951e320b5b3d7eb584ade1d5592f6e5e8f678030989/propcache-0.3.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c0fdbdf6983526e269e5a8d53b7ae3622dd6998468821d660d0daf72779aefa", size = 291499 }, + { url = "https://files.pythonhosted.org/packages/ee/ff/e4179facd21515b24737e1e26e02615dfb5ed29416eed4cf5bc6ac5ce5fb/propcache-0.3.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:794c3dd744fad478b6232289c866c25406ecdfc47e294618bdf1697e69bd64a6", size = 293911 }, + { url = "https://files.pythonhosted.org/packages/76/8d/94a8585992a064a23bd54f56c5e58c3b8bf0c0a06ae10e56f2353ae16c3d/propcache-0.3.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4544699674faf66fb6b4473a1518ae4999c1b614f0b8297b1cef96bac25381db", size = 293301 }, + { url = "https://files.pythonhosted.org/packages/b0/b8/2c860c92b4134f68c7716c6f30a0d723973f881c32a6d7a24c4ddca05fdf/propcache-0.3.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fddb8870bdb83456a489ab67c6b3040a8d5a55069aa6f72f9d872235fbc52f54", size = 281947 }, + { url = "https://files.pythonhosted.org/packages/cd/72/b564be7411b525d11757b713c757c21cd4dc13b6569c3b2b8f6d3c96fd5e/propcache-0.3.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f857034dc68d5ceb30fb60afb6ff2103087aea10a01b613985610e007053a121", size = 268072 }, + { url = "https://files.pythonhosted.org/packages/37/68/d94649e399e8d7fc051e5a4f2334efc567993525af083db145a70690a121/propcache-0.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02df07041e0820cacc8f739510078f2aadcfd3fc57eaeeb16d5ded85c872c89e", size = 275190 }, + { url = "https://files.pythonhosted.org/packages/d8/3c/446e125f5bbbc1922964dd67cb541c01cdb678d811297b79a4ff6accc843/propcache-0.3.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f47d52fd9b2ac418c4890aad2f6d21a6b96183c98021f0a48497a904199f006e", size = 254145 }, + { url = "https://files.pythonhosted.org/packages/f4/80/fd3f741483dc8e59f7ba7e05eaa0f4e11677d7db2077522b92ff80117a2a/propcache-0.3.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:9ff4e9ecb6e4b363430edf2c6e50173a63e0820e549918adef70515f87ced19a", size = 257163 }, + { url = "https://files.pythonhosted.org/packages/dc/cf/6292b5ce6ed0017e6a89024a827292122cc41b6259b30ada0c6732288513/propcache-0.3.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ecc2920630283e0783c22e2ac94427f8cca29a04cfdf331467d4f661f4072dac", size = 280249 }, + { url = "https://files.pythonhosted.org/packages/e8/f0/fd9b8247b449fe02a4f96538b979997e229af516d7462b006392badc59a1/propcache-0.3.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:c441c841e82c5ba7a85ad25986014be8d7849c3cfbdb6004541873505929a74e", size = 288741 }, + { url = "https://files.pythonhosted.org/packages/64/71/cf831fdc2617f86cfd7f414cfc487d018e722dac8acc098366ce9bba0941/propcache-0.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6c929916cbdb540d3407c66f19f73387f43e7c12fa318a66f64ac99da601bcdf", size = 277061 }, + { url = "https://files.pythonhosted.org/packages/42/78/9432542a35d944abeca9e02927a0de38cd7a298466d8ffa171536e2381c3/propcache-0.3.0-cp313-cp313t-win32.whl", hash = "sha256:0c3e893c4464ebd751b44ae76c12c5f5c1e4f6cbd6fbf67e3783cd93ad221863", size = 42252 }, + { url = 
"https://files.pythonhosted.org/packages/6f/45/960365f4f8978f48ebb56b1127adf33a49f2e69ecd46ac1f46d6cf78a79d/propcache-0.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:75e872573220d1ee2305b35c9813626e620768248425f58798413e9c39741f46", size = 46425 }, + { url = "https://files.pythonhosted.org/packages/b5/35/6c4c6fc8774a9e3629cd750dc24a7a4fb090a25ccd5c3246d127b70f9e22/propcache-0.3.0-py3-none-any.whl", hash = "sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043", size = 12101 }, +] + [[package]] name = "protobuf" version = "5.29.3" @@ -1749,6 +2238,48 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/69/c1/ec1930bc6c01754b8baf3c99420f340b920561f0060bccbf81809db354cc/pyaml-25.1.0-py3-none-any.whl", hash = "sha256:f7b40629d2dae88035657c860f539db3525ddd0120a11e0bcb44d47d5968b3bc", size = 26074 }, ] +[[package]] +name = "pyarrow" +version = "19.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7f/09/a9046344212690f0632b9c709f9bf18506522feb333c894d0de81d62341a/pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e", size = 1129437 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/36/01/b23b514d86b839956238d3f8ef206fd2728eee87ff1b8ce150a5678d9721/pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69", size = 30688914 }, + { url = "https://files.pythonhosted.org/packages/c6/68/218ff7cf4a0652a933e5f2ed11274f724dd43b9813cb18dd72c0a35226a2/pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec", size = 32102866 }, + { url = "https://files.pythonhosted.org/packages/98/01/c295050d183014f4a2eb796d7d2bbfa04b6cccde7258bb68aacf6f18779b/pyarrow-19.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89", size = 41147682 }, + { url = "https://files.pythonhosted.org/packages/40/17/a6c3db0b5f3678f33bbb552d2acbc16def67f89a72955b67b0109af23eb0/pyarrow-19.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a", size = 42179192 }, + { url = "https://files.pythonhosted.org/packages/cf/75/c7c8e599300d8cebb6cb339014800e1c720c9db2a3fcb66aa64ec84bac72/pyarrow-19.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a", size = 40517272 }, + { url = "https://files.pythonhosted.org/packages/ef/c9/68ab123ee1528699c4d5055f645ecd1dd68ff93e4699527249d02f55afeb/pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608", size = 42069036 }, + { url = "https://files.pythonhosted.org/packages/54/e3/d5cfd7654084e6c0d9c3ce949e5d9e0ccad569ae1e2d5a68a3ec03b2be89/pyarrow-19.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866", size = 25277951 }, + { url = "https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90", size = 30713987 }, + { url = 
"https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00", size = 32135613 }, + { url = "https://files.pythonhosted.org/packages/2f/8a/23d7cc5ae2066c6c736bce1db8ea7bc9ac3ef97ac7e1c1667706c764d2d9/pyarrow-19.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae", size = 41149147 }, + { url = "https://files.pythonhosted.org/packages/a2/7a/845d151bb81a892dfb368bf11db584cf8b216963ccce40a5cf50a2492a18/pyarrow-19.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5", size = 42178045 }, + { url = "https://files.pythonhosted.org/packages/a7/31/e7282d79a70816132cf6cae7e378adfccce9ae10352d21c2fecf9d9756dd/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3", size = 40532998 }, + { url = "https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6", size = 42084055 }, + { url = "https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466", size = 25283133 }, + { url = "https://files.pythonhosted.org/packages/78/b4/94e828704b050e723f67d67c3535cf7076c7432cd4cf046e4bb3b96a9c9d/pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b", size = 30670749 }, + { url = "https://files.pythonhosted.org/packages/7e/3b/4692965e04bb1df55e2c314c4296f1eb12b4f3052d4cf43d29e076aedf66/pyarrow-19.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294", size = 32128007 }, + { url = "https://files.pythonhosted.org/packages/22/f7/2239af706252c6582a5635c35caa17cb4d401cd74a87821ef702e3888957/pyarrow-19.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14", size = 41144566 }, + { url = "https://files.pythonhosted.org/packages/fb/e3/c9661b2b2849cfefddd9fd65b64e093594b231b472de08ff658f76c732b2/pyarrow-19.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34", size = 42202991 }, + { url = "https://files.pythonhosted.org/packages/fe/4f/a2c0ed309167ef436674782dfee4a124570ba64299c551e38d3fdaf0a17b/pyarrow-19.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6", size = 40507986 }, + { url = "https://files.pythonhosted.org/packages/27/2e/29bb28a7102a6f71026a9d70d1d61df926887e36ec797f2e6acfd2dd3867/pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832", size = 42087026 }, + { url = "https://files.pythonhosted.org/packages/16/33/2a67c0f783251106aeeee516f4806161e7b481f7d744d0d643d2f30230a5/pyarrow-19.0.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960", size = 25250108 }, + { url = "https://files.pythonhosted.org/packages/2b/8d/275c58d4b00781bd36579501a259eacc5c6dfb369be4ddeb672ceb551d2d/pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c", size = 30653552 }, + { url = "https://files.pythonhosted.org/packages/a0/9e/e6aca5cc4ef0c7aec5f8db93feb0bde08dbad8c56b9014216205d271101b/pyarrow-19.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae", size = 32103413 }, + { url = "https://files.pythonhosted.org/packages/6a/fa/a7033f66e5d4f1308c7eb0dfcd2ccd70f881724eb6fd1776657fdf65458f/pyarrow-19.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4", size = 41134869 }, + { url = "https://files.pythonhosted.org/packages/2d/92/34d2569be8e7abdc9d145c98dc410db0071ac579b92ebc30da35f500d630/pyarrow-19.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2", size = 42192626 }, + { url = "https://files.pythonhosted.org/packages/0a/1f/80c617b1084fc833804dc3309aa9d8daacd46f9ec8d736df733f15aebe2c/pyarrow-19.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6", size = 40496708 }, + { url = "https://files.pythonhosted.org/packages/e6/90/83698fcecf939a611c8d9a78e38e7fed7792dcc4317e29e72cf8135526fb/pyarrow-19.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136", size = 42075728 }, + { url = "https://files.pythonhosted.org/packages/40/49/2325f5c9e7a1c125c01ba0c509d400b152c972a47958768e4e35e04d13d8/pyarrow-19.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef", size = 25242568 }, + { url = "https://files.pythonhosted.org/packages/3f/72/135088d995a759d4d916ec4824cb19e066585b4909ebad4ab196177aa825/pyarrow-19.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0", size = 30702371 }, + { url = "https://files.pythonhosted.org/packages/2e/01/00beeebd33d6bac701f20816a29d2018eba463616bbc07397fdf99ac4ce3/pyarrow-19.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9", size = 32116046 }, + { url = "https://files.pythonhosted.org/packages/1f/c9/23b1ea718dfe967cbd986d16cf2a31fe59d015874258baae16d7ea0ccabc/pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3", size = 41091183 }, + { url = "https://files.pythonhosted.org/packages/3a/d4/b4a3aa781a2c715520aa8ab4fe2e7fa49d33a1d4e71c8fc6ab7b5de7a3f8/pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6", size = 42171896 }, + { url = "https://files.pythonhosted.org/packages/23/1b/716d4cd5a3cbc387c6e6745d2704c4b46654ba2668260d25c402626c5ddb/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a", size = 40464851 }, + { url = 
"https://files.pythonhosted.org/packages/ed/bd/54907846383dcc7ee28772d7e646f6c34276a17da740002a5cefe90f04f7/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8", size = 42085744 }, +] + [[package]] name = "pycparser" version = "2.22" @@ -1869,6 +2400,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/37/3e32eeb2a451fddaa3898e2163746b0cffbbdbb4740d38372db0490d67f3/pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151", size = 2004715 }, ] +[[package]] +name = "pydantic-settings" +version = "2.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/88/82/c79424d7d8c29b994fb01d277da57b0a9b09cc03c3ff875f9bd8a86b2145/pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585", size = 83550 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/53/a64f03044927dc47aafe029c42a5b7aabc38dfb813475e0e1bf71c4a59d0/pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c", size = 30839 }, +] + [[package]] name = "pygments" version = "2.19.1" @@ -2124,6 +2668,86 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/fe/72e7e166bda3885810bee7b23049133e142f7c80c295bae02c562caeea16/pyzmq-26.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bd8fdee945b877aa3bffc6a5a8816deb048dab0544f9df3731ecd0e54d8c84c9", size = 556563 }, ] +[[package]] +name = "rapidfuzz" +version = "3.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/be/8dff25a6157dfbde9867720b1282157fe7b809e085130bb89d7655c62186/rapidfuzz-3.12.2.tar.gz", hash = "sha256:b0ba1ccc22fff782e7152a3d3d0caca44ec4e32dc48ba01c560b8593965b5aa3", size = 57907839 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/47/55413211ec32f76c39a6e4f88d024d2194fd4c23abe8102cdbcf28cf80eb/rapidfuzz-3.12.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0b9a75e0385a861178adf59e86d6616cbd0d5adca7228dc9eeabf6f62cf5b0b1", size = 1959750 }, + { url = "https://files.pythonhosted.org/packages/a3/7f/7350c9a68952b52f669b50528b0e53fca2a9d633457fc2a53d8a5e4b1bb2/rapidfuzz-3.12.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6906a7eb458731e3dd2495af1d0410e23a21a2a2b7ced535e6d5cd15cb69afc5", size = 1433727 }, + { url = "https://files.pythonhosted.org/packages/43/b0/148a34adc92f49582add349faaad9d8f4462a76cc30ad2f1d86bdba4fa44/rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4b3334a8958b689f292d5ce8a928140ac98919b51e084f04bf0c14276e4c6ba", size = 1423353 }, + { url = "https://files.pythonhosted.org/packages/1e/8f/923ca60dcd814dba1772420c38c8b70e1fe4e6f0b5699bb3afcbe8c4bed1/rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:85a54ce30345cff2c79cbcffa063f270ad1daedd0d0c3ff6e541d3c3ba4288cf", size = 5641810 }, + { url = "https://files.pythonhosted.org/packages/b8/91/b57ea560a8ff54e0ebb131a62740501ff7f6ffa14dc16e9853a97289614c/rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:acb63c5072c08058f8995404201a52fc4e1ecac105548a4d03c6c6934bda45a3", size = 1683536 }, + { url = 
"https://files.pythonhosted.org/packages/fd/5b/fba390383a82353b72c32b5d14f0f7669a542e7205c55f6d2ae6112369bf/rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5385398d390c6571f0f2a7837e6ddde0c8b912dac096dc8c87208ce9aaaa7570", size = 1685847 }, + { url = "https://files.pythonhosted.org/packages/15/6f/5211f2e80d4e82ff793f214429cbc8a8a69ef7978fd299112ae1c5595ae8/rapidfuzz-3.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5032cbffa245b4beba0067f8ed17392ef2501b346ae3c1f1d14b950edf4b6115", size = 3142196 }, + { url = "https://files.pythonhosted.org/packages/92/fc/d2b4efecf81180c49da09ff97657e0517a5ea55a99b16a1adc56d2900c0b/rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:195adbb384d89d6c55e2fd71e7fb262010f3196e459aa2f3f45f31dd7185fe72", size = 2521222 }, + { url = "https://files.pythonhosted.org/packages/ef/5f/a27e284d37632c808eb7cd6c49178dd52354bfb290843e253af4bd46fa61/rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f43b773a4d4950606fb25568ecde5f25280daf8f97b87eb323e16ecd8177b328", size = 7867428 }, + { url = "https://files.pythonhosted.org/packages/45/68/59168dd67d319a958c525a4eeada5d62a83f83a42b79f9b55917da70f1a7/rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:55a43be0e0fa956a919043c19d19bd988991d15c59f179d413fe5145ed9deb43", size = 2904044 }, + { url = "https://files.pythonhosted.org/packages/5e/40/6bbe014b94d3cef718cfe0be41eb0cecf6fda4b1cd31ba1dddf1984afa08/rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:71cf1ea16acdebe9e2fb62ee7a77f8f70e877bebcbb33b34e660af2eb6d341d9", size = 3551416 }, + { url = "https://files.pythonhosted.org/packages/e4/6b/2f8e0f7de4a5ac54258be885c2e735a315c71187481a7f3d655d650c5c4c/rapidfuzz-3.12.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a3692d4ab36d44685f61326dca539975a4eda49b2a76f0a3df177d8a2c0de9d2", size = 4589777 }, + { url = "https://files.pythonhosted.org/packages/51/b3/84927233624d5e308e4739c748d2cb4ba46675efb7e021661c68b7a7b941/rapidfuzz-3.12.2-cp310-cp310-win32.whl", hash = "sha256:09227bd402caa4397ba1d6e239deea635703b042dd266a4092548661fb22b9c6", size = 1862195 }, + { url = "https://files.pythonhosted.org/packages/c9/49/e101be3e62b6524ea8b271b2e949878c8b58c31a0dc5d30b90f4f5c980e7/rapidfuzz-3.12.2-cp310-cp310-win_amd64.whl", hash = "sha256:0f05b7b95f9f87254b53fa92048367a8232c26cee7fc8665e4337268c3919def", size = 1625063 }, + { url = "https://files.pythonhosted.org/packages/ed/21/a7cbb1eacad92a840a62a22f49d98b423154da49874b787e24bb630f4689/rapidfuzz-3.12.2-cp310-cp310-win_arm64.whl", hash = "sha256:6938738e00d9eb6e04097b3f565097e20b0c398f9c58959a2bc64f7f6be3d9da", size = 870054 }, + { url = "https://files.pythonhosted.org/packages/8e/41/985b8786f7895f7a7f03f80b547e04a5b9f41187f43de386ad2f32b9f9fc/rapidfuzz-3.12.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e9c4d984621ae17404c58f8d06ed8b025e167e52c0e6a511dfec83c37e9220cd", size = 1960568 }, + { url = "https://files.pythonhosted.org/packages/90/9e/9278b4160bf86346fc5f110b5daf07af629343bfcd04a9366d355bc6104e/rapidfuzz-3.12.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9f9132c55d330f0a1d34ce6730a76805323a6250d97468a1ca766a883d6a9a25", size = 1434362 }, + { url = "https://files.pythonhosted.org/packages/e7/53/fe3fb50111e203da4e82b8694c29cbf44101cdbf1efd7ef721cdf85e0aca/rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:39b343b6cb4b2c3dbc8d2d4c5ee915b6088e3b144ddf8305a57eaab16cf9fc74", size = 1417839 }, + { url = "https://files.pythonhosted.org/packages/fd/c4/aa11749bc9d9c0539061d32f2c525d99e11588867d3d6e94693ccd4e0dd0/rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:24081077b571ec4ee6d5d7ea0e49bc6830bf05b50c1005028523b9cd356209f3", size = 5620525 }, + { url = "https://files.pythonhosted.org/packages/5f/62/463c618a5a8a44bf6b087325353e13dbd5bc19c44cc06134d3c9eff0d04a/rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c988a4fc91856260355773bf9d32bebab2083d4c6df33fafeddf4330e5ae9139", size = 1671267 }, + { url = "https://files.pythonhosted.org/packages/ca/b6/ec87c56ed0fab59f8220f5b832d5c1dd374667bee73318a01392ccc8c23d/rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:780b4469ee21cf62b1b2e8ada042941fd2525e45d5fb6a6901a9798a0e41153c", size = 1683415 }, + { url = "https://files.pythonhosted.org/packages/46/08/862e65a1022cbfa2935e7b3f04cdaa73b0967ebf4762ddf509735da47d73/rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd84b0a323885493c893bad16098c5e3b3005d7caa995ae653da07373665d97", size = 3139234 }, + { url = "https://files.pythonhosted.org/packages/ee/fa/7e8c0d1d26a4b892344c743f17e2c8482f749b616cd651590bd60994b49f/rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efa22059c765b3d8778083805b199deaaf643db070f65426f87d274565ddf36a", size = 2523730 }, + { url = "https://files.pythonhosted.org/packages/8a/52/1d5b80e990c2e9998e47be118c2dbabda75daa2a5f5ff978df1ed76d7f81/rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:095776b11bb45daf7c2973dd61cc472d7ea7f2eecfa454aef940b4675659b92f", size = 7880525 }, + { url = "https://files.pythonhosted.org/packages/0c/18/9c8cd7378272590a1eb0855b587f3a1fbd3492bd1612825d675320eeeb1b/rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7e2574cf4aa86065600b664a1ac7b8b8499107d102ecde836aaaa403fc4f1784", size = 2905180 }, + { url = "https://files.pythonhosted.org/packages/4b/94/992de5d0fc9269a93ce62979aced028e0939d3477ea99d87fd0e22f44e8d/rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d5a3425a6c50fd8fbd991d8f085ddb504791dae6ef9cc3ab299fea2cb5374bef", size = 3548613 }, + { url = "https://files.pythonhosted.org/packages/9b/25/ed3a0317f118131ee297de5936e1587e48b059e6438f4bbf92ef3bbc4927/rapidfuzz-3.12.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fb05e1ddb7b71a054040af588b0634214ee87cea87900d309fafc16fd272a4", size = 4583047 }, + { url = "https://files.pythonhosted.org/packages/4d/27/10585a5a62ff6ebbefa3e836a3fd8c123e2ed0bbde8cfcdd7477032cd458/rapidfuzz-3.12.2-cp311-cp311-win32.whl", hash = "sha256:b4c5a0413589aef936892fbfa94b7ff6f7dd09edf19b5a7b83896cc9d4e8c184", size = 1863208 }, + { url = "https://files.pythonhosted.org/packages/38/4c/faacecf70a4e202a02f029ec6f6e04e910d95c4ef36d7d63b83b160f7f3e/rapidfuzz-3.12.2-cp311-cp311-win_amd64.whl", hash = "sha256:58d9ae5cf9246d102db2a2558b67fe7e73c533e5d769099747921232d88b9be2", size = 1630876 }, + { url = "https://files.pythonhosted.org/packages/a7/4b/4931da26e0677880a9a533ef75ccbe564c091aa4a3579aed0355c7e06900/rapidfuzz-3.12.2-cp311-cp311-win_arm64.whl", hash = "sha256:7635fe34246cd241c8e35eb83084e978b01b83d5ef7e5bf72a704c637f270017", size = 870896 }, + { url = 
"https://files.pythonhosted.org/packages/a7/d2/e071753227c9e9f7f3550b983f30565f6e994581529815fa5a8879e7cd10/rapidfuzz-3.12.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1d982a651253ffe8434d9934ff0c1089111d60502228464721a2a4587435e159", size = 1944403 }, + { url = "https://files.pythonhosted.org/packages/aa/d1/4a10d21cc97aa36f4019af24382b5b4dc5ea6444499883c1c1286c6089ba/rapidfuzz-3.12.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:02e6466caa0222d5233b1f05640873671cd99549a5c5ba4c29151634a1e56080", size = 1430287 }, + { url = "https://files.pythonhosted.org/packages/6a/2d/76d39ab0beeb884d432096fe288c41850e37608e0145264081d0cb809f3c/rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e956b3f053e474abae69ac693a52742109d860ac2375fe88e9387d3277f4c96c", size = 1403693 }, + { url = "https://files.pythonhosted.org/packages/85/1a/719b0f6498c003627e4b83b841bdcd48b11de8a9908a9051c4d2a0bc2245/rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dee7d740a2d5418d4f964f39ab8d89923e6b945850db833e798a1969b19542a", size = 5555878 }, + { url = "https://files.pythonhosted.org/packages/af/48/14d952a73254b4b0e517141acd27979bd23948adaf197f6ca2dc722fde6a/rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a057cdb0401e42c84b6516c9b1635f7aedd5e430c6e388bd5f6bcd1d6a0686bb", size = 1655301 }, + { url = "https://files.pythonhosted.org/packages/db/3f/b093e154e9752325d7459aa6dca43b7acbcaffa05133507e2403676e3e75/rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dccf8d4fb5b86d39c581a59463c596b1d09df976da26ff04ae219604223d502f", size = 1678069 }, + { url = "https://files.pythonhosted.org/packages/d6/7e/88853ecae5b5456eb1a1d8a01cbd534e25b671735d5d974609cbae082542/rapidfuzz-3.12.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21d5b3793c6f5aecca595cd24164bf9d3c559e315ec684f912146fc4e769e367", size = 3137119 }, + { url = "https://files.pythonhosted.org/packages/4d/d2/b1f809b815aaf682ddac9c57929149f740b90feeb4f8da2f535c196de821/rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:46a616c0e13cff2de1761b011e0b14bb73b110182f009223f1453d505c9a975c", size = 2491639 }, + { url = "https://files.pythonhosted.org/packages/61/e4/a908d7b8db6e52ba2f80f6f0d0709ef9fdedb767db4307084331742b67f0/rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19fa5bc4301a1ee55400d4a38a8ecf9522b0391fc31e6da5f4d68513fe5c0026", size = 7821561 }, + { url = "https://files.pythonhosted.org/packages/f3/83/0250c49deefff15c46f5e590d8ee6abbd0f056e20b85994db55c16ac6ead/rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:544a47190a0d25971658a9365dba7095397b4ce3e897f7dd0a77ca2cf6fa984e", size = 2874048 }, + { url = "https://files.pythonhosted.org/packages/6c/3f/8d433d964c6e476476ee53eae5fa77b9f16b38d312eb1571e9099a6a3b12/rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f21af27c5e001f0ba1b88c36a0936437dfe034c452548d998891c21125eb640f", size = 3522801 }, + { url = "https://files.pythonhosted.org/packages/82/85/4931bfa41ef837b1544838e46e0556640d18114b3da9cf05e10defff00ae/rapidfuzz-3.12.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b63170d9db00629b5b3f2862114d8d6ee19127eaba0eee43762d62a25817dbe0", size = 4567304 }, + { url = "https://files.pythonhosted.org/packages/b1/fe/fdae322869885115dd19a38c1da71b73a8832aa77757c93f460743d4f54c/rapidfuzz-3.12.2-cp312-cp312-win32.whl", hash = 
"sha256:6c7152d77b2eb6bfac7baa11f2a9c45fd5a2d848dbb310acd0953b3b789d95c9", size = 1845332 }, + { url = "https://files.pythonhosted.org/packages/ca/a4/2ccebda5fb8a266d163d57a42c2a6ef6f91815df5d89cf38c12e8aa6ed0b/rapidfuzz-3.12.2-cp312-cp312-win_amd64.whl", hash = "sha256:1a314d170ee272ac87579f25a6cf8d16a031e1f7a7b07663434b41a1473bc501", size = 1617926 }, + { url = "https://files.pythonhosted.org/packages/a5/bc/aa8a4dc4ebff966dd039cce017c614cfd202049b4d1a2daafee7d018521b/rapidfuzz-3.12.2-cp312-cp312-win_arm64.whl", hash = "sha256:d41e8231326e94fd07c4d8f424f6bed08fead6f5e6688d1e6e787f1443ae7631", size = 864737 }, + { url = "https://files.pythonhosted.org/packages/96/59/2ea3b5bb82798eae73d6ee892264ebfe42727626c1f0e96c77120f0d5cf6/rapidfuzz-3.12.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:941f31038dba5d3dedcfcceba81d61570ad457c873a24ceb13f4f44fcb574260", size = 1936870 }, + { url = "https://files.pythonhosted.org/packages/54/85/4e486bf9ea05e771ad231731305ed701db1339157f630b76b246ce29cf71/rapidfuzz-3.12.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fe2dfc454ee51ba168a67b1e92b72aad251e45a074972cef13340bbad2fd9438", size = 1424231 }, + { url = "https://files.pythonhosted.org/packages/dc/60/aeea3eed402c40a8cf055d554678769fbee0dd95c22f04546070a22bb90e/rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78fafaf7f5a48ee35ccd7928339080a0136e27cf97396de45259eca1d331b714", size = 1398055 }, + { url = "https://files.pythonhosted.org/packages/33/6b/757106f4c21fe3f20ce13ba3df560da60e52fe0dc390fd22bf613761669c/rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0c7989ff32c077bb8fd53253fd6ca569d1bfebc80b17557e60750e6909ba4fe", size = 5526188 }, + { url = "https://files.pythonhosted.org/packages/1e/a2/7c680cdc5532746dba67ecf302eed975252657094e50ae334fa9268352e8/rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96fa00bc105caa34b6cd93dca14a29243a3a7f0c336e4dcd36348d38511e15ac", size = 1648483 }, + { url = "https://files.pythonhosted.org/packages/f6/b0/ce942a1448b1a75d64af230dd746dede502224dd29ca9001665bbfd4bee6/rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bccfb30c668620c5bc3490f2dc7d7da1cca0ead5a9da8b755e2e02e2ef0dff14", size = 1676076 }, + { url = "https://files.pythonhosted.org/packages/ba/71/81f77b08333200be6984b6cdf2bdfd7cfca4943f16b478a2f7838cba8d66/rapidfuzz-3.12.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f9b0adc3d894beb51f5022f64717b6114a6fabaca83d77e93ac7675911c8cc5", size = 3114169 }, + { url = "https://files.pythonhosted.org/packages/01/16/f3f34b207fdc8c61a33f9d2d61fc96b62c7dadca88bda1df1be4b94afb0b/rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:32691aa59577f42864d5535cb6225d0f47e2c7bff59cf4556e5171e96af68cc1", size = 2485317 }, + { url = "https://files.pythonhosted.org/packages/b2/a6/b954f0766f644eb8dd8df44703e024ab4f5f15a8f8f5ea969963dd036f50/rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:758b10380ad34c1f51753a070d7bb278001b5e6fcf544121c6df93170952d705", size = 7844495 }, + { url = "https://files.pythonhosted.org/packages/fb/8f/1dc604d05e07150a02b56a8ffc47df75ce316c65467259622c9edf098451/rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:50a9c54c0147b468363119132d514c5024fbad1ed8af12bd8bd411b0119f9208", size = 2873242 }, + { url = 
"https://files.pythonhosted.org/packages/78/a9/9c649ace4b7f885e0a5fdcd1f33b057ebd83ecc2837693e6659bd944a2bb/rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e3ceb87c11d2d0fbe8559bb795b0c0604b84cfc8bb7b8720b5c16e9e31e00f41", size = 3519124 }, + { url = "https://files.pythonhosted.org/packages/f5/81/ce0b774e540a2e22ec802e383131d7ead18347197304d584c4ccf7b8861a/rapidfuzz-3.12.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f7c9a003002434889255ff5676ca0f8934a478065ab5e702f75dc42639505bba", size = 4557831 }, + { url = "https://files.pythonhosted.org/packages/13/28/7bf0ee8d35efa7ab14e83d1795cdfd54833aa0428b6f87e987893136c372/rapidfuzz-3.12.2-cp313-cp313-win32.whl", hash = "sha256:cf165a76870cd875567941cf861dfd361a0a6e6a56b936c5d30042ddc9def090", size = 1842802 }, + { url = "https://files.pythonhosted.org/packages/ef/7e/792d609484776c8a40e1695ebd28b62196be9f8347b785b9104604dc7268/rapidfuzz-3.12.2-cp313-cp313-win_amd64.whl", hash = "sha256:55bcc003541f5f16ec0a73bf6de758161973f9e8d75161954380738dd147f9f2", size = 1615808 }, + { url = "https://files.pythonhosted.org/packages/4b/43/ca3d1018b392f49131843648e10b08ace23afe8dad3bee5f136e4346b7cd/rapidfuzz-3.12.2-cp313-cp313-win_arm64.whl", hash = "sha256:69f6ecdf1452139f2b947d0c169a605de578efdb72cbb2373cb0a94edca1fd34", size = 863535 }, + { url = "https://files.pythonhosted.org/packages/92/77/a72abb16c5cb093980570871aa152e6d47fc9cf2482daeea9687708be655/rapidfuzz-3.12.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e5fd3ce849b27d063755829cda27a9dab6dbd63be3801f2a40c60ec563a4c90f", size = 1858463 }, + { url = "https://files.pythonhosted.org/packages/8c/93/06a29076722ef6b05a81132eac9847592185ee97a1dadc7ead2f37334ebe/rapidfuzz-3.12.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:54e53662d71ed660c83c5109127c8e30b9e607884b7c45d2aff7929bbbd00589", size = 1368517 }, + { url = "https://files.pythonhosted.org/packages/f9/4f/36e8ae37e82a617b8d8da8162744bf69b15091743c3f70699090cb793dd5/rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b9e43cf2213e524f3309d329f1ad8dbf658db004ed44f6ae1cd2919aa997da5", size = 1364411 }, + { url = "https://files.pythonhosted.org/packages/63/f5/ac535622eb163b9a242c40633587916e71f23233bcd6e3d3e70ae2a99a4c/rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29ca445e320e5a8df3bd1d75b4fa4ecfa7c681942b9ac65b55168070a1a1960e", size = 5486500 }, + { url = "https://files.pythonhosted.org/packages/6f/de/87fcb20fda640a2cf0cebe4b0dc3ab970b1ef8a9d48d05363e375fc05982/rapidfuzz-3.12.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83eb7ef732c2f8533c6b5fbe69858a722c218acc3e1fc190ab6924a8af7e7e0e", size = 3064900 }, + { url = "https://files.pythonhosted.org/packages/c3/67/c7c4129e8b8b674a7b1d82edc36ed093418fdcf011e3a25150895b24a963/rapidfuzz-3.12.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:648adc2dd2cf873efc23befcc6e75754e204a409dfa77efd0fea30d08f22ef9d", size = 1555181 }, + { url = "https://files.pythonhosted.org/packages/ee/4d/e910b70839d88d1c38ba806b0ddaa94b478cca8a09f4e7155b2b607c34b2/rapidfuzz-3.12.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9b1e6f48e1ffa0749261ee23a1c6462bdd0be5eac83093f4711de17a42ae78ad", size = 1860425 }, + { url = "https://files.pythonhosted.org/packages/fd/62/54914f63e185539fbcca65acb1f7c879740a278d240527ed5ddd40bd7690/rapidfuzz-3.12.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:1ae9ded463f2ca4ba1eb762913c5f14c23d2e120739a62b7f4cc102eab32dc90", size = 1369066 }, + { url = "https://files.pythonhosted.org/packages/56/4a/de2cfab279497d0b2529d3fec398f60cf8e27a51d667b6529081fbdb0af2/rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dda45f47b559be72ecbce45c7f71dc7c97b9772630ab0f3286d97d2c3025ab71", size = 1365330 }, + { url = "https://files.pythonhosted.org/packages/dd/48/170c37cfdf04efa34e7cafc688a8517c9098c1d27e1513393ad71bf3165c/rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3745c6443890265513a3c8777f2de4cb897aeb906a406f97741019be8ad5bcc", size = 5481251 }, + { url = "https://files.pythonhosted.org/packages/4e/2d/107c489443f6438780d2e40747d5880c8d9374a64e17487eb4085fe7f1f5/rapidfuzz-3.12.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36d3ef4f047ed1bc96fa29289f9e67a637ddca5e4f4d3dc7cb7f50eb33ec1664", size = 3060633 }, + { url = "https://files.pythonhosted.org/packages/09/f6/fa777f336629aee8938f3d5c95c09df38459d4eadbdbe34642889857fb6a/rapidfuzz-3.12.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:54bb69ebe5ca0bd7527357e348f16a4c0c52fe0c2fcc8a041010467dcb8385f7", size = 1555000 }, +] + [[package]] name = "referencing" version = "0.36.2" @@ -2651,6 +3275,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/98/e8bc58b178266eae2fcf4c9c7a8303a8d41164d781b32d71097924a6bebe/sqlite_vec-0.1.6-py3-none-win_amd64.whl", hash = "sha256:c65bcfd90fa2f41f9000052bcb8bb75d38240b2dae49225389eca6c3136d3f0c", size = 281540 }, ] +[[package]] +name = "sse-starlette" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "starlette" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/a4/80d2a11af59fe75b48230846989e93979c892d3a20016b42bb44edb9e398/sse_starlette-2.2.1.tar.gz", hash = "sha256:54470d5f19274aeed6b2d473430b08b4b379ea851d953b11d7f1c4a2c118b419", size = 17376 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/e0/5b8bd393f27f4a62461c5cf2479c75a2cc2ffa330976f9f00f5f6e4f50eb/sse_starlette-2.2.1-py3-none-any.whl", hash = "sha256:6410a3d3ba0c89e7675d4c273a301d64649c03a5ef1ca101f10b47f895fd0e99", size = 10120 }, +] + [[package]] name = "stack-data" version = "0.6.3" @@ -3202,6 +3839,157 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 }, ] +[[package]] +name = "xxhash" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/8a/0e9feca390d512d293afd844d31670e25608c4a901e10202aa98785eab09/xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212", size = 31970 }, + { url = "https://files.pythonhosted.org/packages/16/e6/be5aa49580cd064a18200ab78e29b88b1127e1a8c7955eb8ecf81f2626eb/xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520", size = 30801 }, + { url = 
"https://files.pythonhosted.org/packages/20/ee/b8a99ebbc6d1113b3a3f09e747fa318c3cde5b04bd9c197688fadf0eeae8/xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680", size = 220927 }, + { url = "https://files.pythonhosted.org/packages/58/62/15d10582ef159283a5c2b47f6d799fc3303fe3911d5bb0bcc820e1ef7ff4/xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da", size = 200360 }, + { url = "https://files.pythonhosted.org/packages/23/41/61202663ea9b1bd8e53673b8ec9e2619989353dba8cfb68e59a9cbd9ffe3/xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23", size = 428528 }, + { url = "https://files.pythonhosted.org/packages/f2/07/d9a3059f702dec5b3b703737afb6dda32f304f6e9da181a229dafd052c29/xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196", size = 194149 }, + { url = "https://files.pythonhosted.org/packages/eb/58/27caadf78226ecf1d62dbd0c01d152ed381c14c1ee4ad01f0d460fc40eac/xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c", size = 207703 }, + { url = "https://files.pythonhosted.org/packages/b1/08/32d558ce23e1e068453c39aed7b3c1cdc690c177873ec0ca3a90d5808765/xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482", size = 216255 }, + { url = "https://files.pythonhosted.org/packages/3f/d4/2b971e2d2b0a61045f842b622ef11e94096cf1f12cd448b6fd426e80e0e2/xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296", size = 202744 }, + { url = "https://files.pythonhosted.org/packages/19/ae/6a6438864a8c4c39915d7b65effd85392ebe22710412902487e51769146d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415", size = 210115 }, + { url = "https://files.pythonhosted.org/packages/48/7d/b3c27c27d1fc868094d02fe4498ccce8cec9fcc591825c01d6bcb0b4fc49/xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198", size = 414247 }, + { url = "https://files.pythonhosted.org/packages/a1/05/918f9e7d2fbbd334b829997045d341d6239b563c44e683b9a7ef8fe50f5d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442", size = 191419 }, + { url = "https://files.pythonhosted.org/packages/08/29/dfe393805b2f86bfc47c290b275f0b7c189dc2f4e136fd4754f32eb18a8d/xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da", size = 30114 }, + { url = "https://files.pythonhosted.org/packages/7b/d7/aa0b22c4ebb7c3ccb993d4c565132abc641cd11164f8952d89eb6a501909/xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9", size = 30003 }, + { url = "https://files.pythonhosted.org/packages/69/12/f969b81541ee91b55f1ce469d7ab55079593c80d04fd01691b550e535000/xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = 
"sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6", size = 26773 }, + { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969 }, + { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800 }, + { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566 }, + { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214 }, + { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433 }, + { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822 }, + { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538 }, + { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953 }, + { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594 }, + { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971 }, + { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050 }, + { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216 }, + { url = 
"https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120 }, + { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003 }, + { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777 }, + { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969 }, + { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787 }, + { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959 }, + { url = "https://files.pythonhosted.org/packages/fe/86/51258d3e8a8545ff26468c977101964c14d56a8a37f5835bc0082426c672/xxhash-3.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a1ff00674879725b194695e17f23d3248998b843eb5e933007ca743310f793", size = 200006 }, + { url = "https://files.pythonhosted.org/packages/02/0a/96973bd325412feccf23cf3680fd2246aebf4b789122f938d5557c54a6b2/xxhash-3.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2f2c61bee5844d41c3eb015ac652a0229e901074951ae48581d58bfb2ba01be", size = 428326 }, + { url = "https://files.pythonhosted.org/packages/11/a7/81dba5010f7e733de88af9555725146fc133be97ce36533867f4c7e75066/xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d32a592cac88d18cc09a89172e1c32d7f2a6e516c3dfde1b9adb90ab5df54a6", size = 194380 }, + { url = "https://files.pythonhosted.org/packages/fb/7d/f29006ab398a173f4501c0e4977ba288f1c621d878ec217b4ff516810c04/xxhash-3.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:70dabf941dede727cca579e8c205e61121afc9b28516752fd65724be1355cc90", size = 207934 }, + { url = "https://files.pythonhosted.org/packages/8a/6e/6e88b8f24612510e73d4d70d9b0c7dff62a2e78451b9f0d042a5462c8d03/xxhash-3.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e5d0ddaca65ecca9c10dcf01730165fd858533d0be84c75c327487c37a906a27", size = 216301 }, + { url = "https://files.pythonhosted.org/packages/af/51/7862f4fa4b75a25c3b4163c8a873f070532fe5f2d3f9b3fc869c8337a398/xxhash-3.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e5b5e16c5a480fe5f59f56c30abdeba09ffd75da8d13f6b9b6fd224d0b4d0a2", size = 203351 }, + { url = "https://files.pythonhosted.org/packages/22/61/8d6a40f288f791cf79ed5bb113159abf0c81d6efb86e734334f698eb4c59/xxhash-3.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = 
"sha256:149b7914451eb154b3dfaa721315117ea1dac2cc55a01bfbd4df7c68c5dd683d", size = 210294 }, + { url = "https://files.pythonhosted.org/packages/17/02/215c4698955762d45a8158117190261b2dbefe9ae7e5b906768c09d8bc74/xxhash-3.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:eade977f5c96c677035ff39c56ac74d851b1cca7d607ab3d8f23c6b859379cab", size = 414674 }, + { url = "https://files.pythonhosted.org/packages/31/5c/b7a8db8a3237cff3d535261325d95de509f6a8ae439a5a7a4ffcff478189/xxhash-3.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fa9f547bd98f5553d03160967866a71056a60960be00356a15ecc44efb40ba8e", size = 192022 }, + { url = "https://files.pythonhosted.org/packages/78/e3/dd76659b2811b3fd06892a8beb850e1996b63e9235af5a86ea348f053e9e/xxhash-3.5.0-cp312-cp312-win32.whl", hash = "sha256:f7b58d1fd3551b8c80a971199543379be1cee3d0d409e1f6d8b01c1a2eebf1f8", size = 30170 }, + { url = "https://files.pythonhosted.org/packages/d9/6b/1c443fe6cfeb4ad1dcf231cdec96eb94fb43d6498b4469ed8b51f8b59a37/xxhash-3.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:fa0cafd3a2af231b4e113fba24a65d7922af91aeb23774a8b78228e6cd785e3e", size = 30040 }, + { url = "https://files.pythonhosted.org/packages/0f/eb/04405305f290173acc0350eba6d2f1a794b57925df0398861a20fbafa415/xxhash-3.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:586886c7e89cb9828bcd8a5686b12e161368e0064d040e225e72607b43858ba2", size = 26796 }, + { url = "https://files.pythonhosted.org/packages/c9/b8/e4b3ad92d249be5c83fa72916c9091b0965cb0faeff05d9a0a3870ae6bff/xxhash-3.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37889a0d13b0b7d739cfc128b1c902f04e32de17b33d74b637ad42f1c55101f6", size = 31795 }, + { url = "https://files.pythonhosted.org/packages/fc/d8/b3627a0aebfbfa4c12a41e22af3742cf08c8ea84f5cc3367b5de2d039cce/xxhash-3.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:97a662338797c660178e682f3bc180277b9569a59abfb5925e8620fba00b9fc5", size = 30792 }, + { url = "https://files.pythonhosted.org/packages/c3/cc/762312960691da989c7cd0545cb120ba2a4148741c6ba458aa723c00a3f8/xxhash-3.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f85e0108d51092bdda90672476c7d909c04ada6923c14ff9d913c4f7dc8a3bc", size = 220950 }, + { url = "https://files.pythonhosted.org/packages/fe/e9/cc266f1042c3c13750e86a535496b58beb12bf8c50a915c336136f6168dc/xxhash-3.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2fd827b0ba763ac919440042302315c564fdb797294d86e8cdd4578e3bc7f3", size = 199980 }, + { url = "https://files.pythonhosted.org/packages/bf/85/a836cd0dc5cc20376de26b346858d0ac9656f8f730998ca4324921a010b9/xxhash-3.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:82085c2abec437abebf457c1d12fccb30cc8b3774a0814872511f0f0562c768c", size = 428324 }, + { url = "https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb", size = 194370 }, + { url = "https://files.pythonhosted.org/packages/87/a1/b028bb02636dfdc190da01951d0703b3d904301ed0ef6094d948983bef0e/xxhash-3.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c279f0d2b34ef15f922b77966640ade58b4ccdfef1c4d94b20f2a364617a493f", size = 207911 }, + { url = 
"https://files.pythonhosted.org/packages/80/d5/73c73b03fc0ac73dacf069fdf6036c9abad82de0a47549e9912c955ab449/xxhash-3.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89e66ceed67b213dec5a773e2f7a9e8c58f64daeb38c7859d8815d2c89f39ad7", size = 216352 }, + { url = "https://files.pythonhosted.org/packages/b6/2a/5043dba5ddbe35b4fe6ea0a111280ad9c3d4ba477dd0f2d1fe1129bda9d0/xxhash-3.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bcd51708a633410737111e998ceb3b45d3dbc98c0931f743d9bb0a209033a326", size = 203410 }, + { url = "https://files.pythonhosted.org/packages/a2/b2/9a8ded888b7b190aed75b484eb5c853ddd48aa2896e7b59bbfbce442f0a1/xxhash-3.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ff2c0a34eae7df88c868be53a8dd56fbdf592109e21d4bfa092a27b0bf4a7bf", size = 210322 }, + { url = "https://files.pythonhosted.org/packages/98/62/440083fafbc917bf3e4b67c2ade621920dd905517e85631c10aac955c1d2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e28503dccc7d32e0b9817aa0cbfc1f45f563b2c995b7a66c4c8a0d232e840c7", size = 414725 }, + { url = "https://files.pythonhosted.org/packages/75/db/009206f7076ad60a517e016bb0058381d96a007ce3f79fa91d3010f49cc2/xxhash-3.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a6c50017518329ed65a9e4829154626f008916d36295b6a3ba336e2458824c8c", size = 192070 }, + { url = "https://files.pythonhosted.org/packages/1f/6d/c61e0668943a034abc3a569cdc5aeae37d686d9da7e39cf2ed621d533e36/xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637", size = 30172 }, + { url = "https://files.pythonhosted.org/packages/96/14/8416dce965f35e3d24722cdf79361ae154fa23e2ab730e5323aa98d7919e/xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43", size = 30041 }, + { url = "https://files.pythonhosted.org/packages/27/ee/518b72faa2073f5aa8e3262408d284892cb79cf2754ba0c3a5870645ef73/xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b", size = 26801 }, + { url = "https://files.pythonhosted.org/packages/ab/9a/233606bada5bd6f50b2b72c45de3d9868ad551e83893d2ac86dc7bb8553a/xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c", size = 29732 }, + { url = "https://files.pythonhosted.org/packages/0c/67/f75276ca39e2c6604e3bee6c84e9db8a56a4973fde9bf35989787cf6e8aa/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986", size = 36214 }, + { url = "https://files.pythonhosted.org/packages/0f/f8/f6c61fd794229cc3848d144f73754a0c107854372d7261419dcbbd286299/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6", size = 32020 }, + { url = "https://files.pythonhosted.org/packages/79/d3/c029c99801526f859e6b38d34ab87c08993bf3dcea34b11275775001638a/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b", size = 40515 }, + { url = "https://files.pythonhosted.org/packages/62/e3/bef7b82c1997579c94de9ac5ea7626d01ae5858aa22bf4fcb38bf220cb3e/xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da", size = 
30064 }, +] + +[[package]] +name = "yarl" +version = "1.18.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/9d/4b94a8e6d2b51b599516a5cb88e5bc99b4d8d4583e468057eaa29d5f0918/yarl-1.18.3.tar.gz", hash = "sha256:ac1801c45cbf77b6c99242eeff4fffb5e4e73a800b5c4ad4fc0be5def634d2e1", size = 181062 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/98/e005bc608765a8a5569f58e650961314873c8469c333616eb40bff19ae97/yarl-1.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34", size = 141458 }, + { url = "https://files.pythonhosted.org/packages/df/5d/f8106b263b8ae8a866b46d9be869ac01f9b3fb7f2325f3ecb3df8003f796/yarl-1.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7", size = 94365 }, + { url = "https://files.pythonhosted.org/packages/56/3e/d8637ddb9ba69bf851f765a3ee288676f7cf64fb3be13760c18cbc9d10bd/yarl-1.18.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:602d98f2c2d929f8e697ed274fbadc09902c4025c5a9963bf4e9edfc3ab6f7ed", size = 92181 }, + { url = "https://files.pythonhosted.org/packages/76/f9/d616a5c2daae281171de10fba41e1c0e2d8207166fc3547252f7d469b4e1/yarl-1.18.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c654d5207c78e0bd6d749f6dae1dcbbfde3403ad3a4b11f3c5544d9906969dde", size = 315349 }, + { url = "https://files.pythonhosted.org/packages/bb/b4/3ea5e7b6f08f698b3769a06054783e434f6d59857181b5c4e145de83f59b/yarl-1.18.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5094d9206c64181d0f6e76ebd8fb2f8fe274950a63890ee9e0ebfd58bf9d787b", size = 330494 }, + { url = "https://files.pythonhosted.org/packages/55/f1/e0fc810554877b1b67420568afff51b967baed5b53bcc983ab164eebf9c9/yarl-1.18.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35098b24e0327fc4ebdc8ffe336cee0a87a700c24ffed13161af80124b7dc8e5", size = 326927 }, + { url = "https://files.pythonhosted.org/packages/a9/42/b1753949b327b36f210899f2dd0a0947c0c74e42a32de3f8eb5c7d93edca/yarl-1.18.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3236da9272872443f81fedc389bace88408f64f89f75d1bdb2256069a8730ccc", size = 319703 }, + { url = "https://files.pythonhosted.org/packages/f0/6d/e87c62dc9635daefb064b56f5c97df55a2e9cc947a2b3afd4fd2f3b841c7/yarl-1.18.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2c08cc9b16f4f4bc522771d96734c7901e7ebef70c6c5c35dd0f10845270bcd", size = 310246 }, + { url = "https://files.pythonhosted.org/packages/e3/ef/e2e8d1785cdcbd986f7622d7f0098205f3644546da7919c24b95790ec65a/yarl-1.18.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80316a8bd5109320d38eef8833ccf5f89608c9107d02d2a7f985f98ed6876990", size = 319730 }, + { url = "https://files.pythonhosted.org/packages/fc/15/8723e22345bc160dfde68c4b3ae8b236e868f9963c74015f1bc8a614101c/yarl-1.18.3-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:c1e1cc06da1491e6734f0ea1e6294ce00792193c463350626571c287c9a704db", size = 321681 }, + { url = "https://files.pythonhosted.org/packages/86/09/bf764e974f1516efa0ae2801494a5951e959f1610dd41edbfc07e5e0f978/yarl-1.18.3-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fea09ca13323376a2fdfb353a5fa2e59f90cd18d7ca4eaa1fd31f0a8b4f91e62", size = 324812 }, + { 
url = "https://files.pythonhosted.org/packages/f6/4c/20a0187e3b903c97d857cf0272d687c1b08b03438968ae8ffc50fe78b0d6/yarl-1.18.3-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e3b9fd71836999aad54084906f8663dffcd2a7fb5cdafd6c37713b2e72be1760", size = 337011 }, + { url = "https://files.pythonhosted.org/packages/c9/71/6244599a6e1cc4c9f73254a627234e0dad3883ece40cc33dce6265977461/yarl-1.18.3-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:757e81cae69244257d125ff31663249b3013b5dc0a8520d73694aed497fb195b", size = 338132 }, + { url = "https://files.pythonhosted.org/packages/af/f5/e0c3efaf74566c4b4a41cb76d27097df424052a064216beccae8d303c90f/yarl-1.18.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b1771de9944d875f1b98a745bc547e684b863abf8f8287da8466cf470ef52690", size = 331849 }, + { url = "https://files.pythonhosted.org/packages/8a/b8/3d16209c2014c2f98a8f658850a57b716efb97930aebf1ca0d9325933731/yarl-1.18.3-cp310-cp310-win32.whl", hash = "sha256:8874027a53e3aea659a6d62751800cf6e63314c160fd607489ba5c2edd753cf6", size = 84309 }, + { url = "https://files.pythonhosted.org/packages/fd/b7/2e9a5b18eb0fe24c3a0e8bae994e812ed9852ab4fd067c0107fadde0d5f0/yarl-1.18.3-cp310-cp310-win_amd64.whl", hash = "sha256:93b2e109287f93db79210f86deb6b9bbb81ac32fc97236b16f7433db7fc437d8", size = 90484 }, + { url = "https://files.pythonhosted.org/packages/40/93/282b5f4898d8e8efaf0790ba6d10e2245d2c9f30e199d1a85cae9356098c/yarl-1.18.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8503ad47387b8ebd39cbbbdf0bf113e17330ffd339ba1144074da24c545f0069", size = 141555 }, + { url = "https://files.pythonhosted.org/packages/6d/9c/0a49af78df099c283ca3444560f10718fadb8a18dc8b3edf8c7bd9fd7d89/yarl-1.18.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02ddb6756f8f4517a2d5e99d8b2f272488e18dd0bfbc802f31c16c6c20f22193", size = 94351 }, + { url = "https://files.pythonhosted.org/packages/5a/a1/205ab51e148fdcedad189ca8dd587794c6f119882437d04c33c01a75dece/yarl-1.18.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:67a283dd2882ac98cc6318384f565bffc751ab564605959df4752d42483ad889", size = 92286 }, + { url = "https://files.pythonhosted.org/packages/ed/fe/88b690b30f3f59275fb674f5f93ddd4a3ae796c2b62e5bb9ece8a4914b83/yarl-1.18.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d980e0325b6eddc81331d3f4551e2a333999fb176fd153e075c6d1c2530aa8a8", size = 340649 }, + { url = "https://files.pythonhosted.org/packages/07/eb/3b65499b568e01f36e847cebdc8d7ccb51fff716dbda1ae83c3cbb8ca1c9/yarl-1.18.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b643562c12680b01e17239be267bc306bbc6aac1f34f6444d1bded0c5ce438ca", size = 356623 }, + { url = "https://files.pythonhosted.org/packages/33/46/f559dc184280b745fc76ec6b1954de2c55595f0ec0a7614238b9ebf69618/yarl-1.18.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c017a3b6df3a1bd45b9fa49a0f54005e53fbcad16633870104b66fa1a30a29d8", size = 354007 }, + { url = "https://files.pythonhosted.org/packages/af/ba/1865d85212351ad160f19fb99808acf23aab9a0f8ff31c8c9f1b4d671fc9/yarl-1.18.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75674776d96d7b851b6498f17824ba17849d790a44d282929c42dbb77d4f17ae", size = 344145 }, + { url = "https://files.pythonhosted.org/packages/94/cb/5c3e975d77755d7b3d5193e92056b19d83752ea2da7ab394e22260a7b824/yarl-1.18.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ccaa3a4b521b780a7e771cc336a2dba389a0861592bbce09a476190bb0c8b4b3", size = 336133 }, + { url = "https://files.pythonhosted.org/packages/19/89/b77d3fd249ab52a5c40859815765d35c91425b6bb82e7427ab2f78f5ff55/yarl-1.18.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2d06d3005e668744e11ed80812e61efd77d70bb7f03e33c1598c301eea20efbb", size = 347967 }, + { url = "https://files.pythonhosted.org/packages/35/bd/f6b7630ba2cc06c319c3235634c582a6ab014d52311e7d7c22f9518189b5/yarl-1.18.3-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:9d41beda9dc97ca9ab0b9888cb71f7539124bc05df02c0cff6e5acc5a19dcc6e", size = 346397 }, + { url = "https://files.pythonhosted.org/packages/18/1a/0b4e367d5a72d1f095318344848e93ea70da728118221f84f1bf6c1e39e7/yarl-1.18.3-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ba23302c0c61a9999784e73809427c9dbedd79f66a13d84ad1b1943802eaaf59", size = 350206 }, + { url = "https://files.pythonhosted.org/packages/b5/cf/320fff4367341fb77809a2d8d7fe75b5d323a8e1b35710aafe41fdbf327b/yarl-1.18.3-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6748dbf9bfa5ba1afcc7556b71cda0d7ce5f24768043a02a58846e4a443d808d", size = 362089 }, + { url = "https://files.pythonhosted.org/packages/57/cf/aadba261d8b920253204085268bad5e8cdd86b50162fcb1b10c10834885a/yarl-1.18.3-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0b0cad37311123211dc91eadcb322ef4d4a66008d3e1bdc404808992260e1a0e", size = 366267 }, + { url = "https://files.pythonhosted.org/packages/54/58/fb4cadd81acdee6dafe14abeb258f876e4dd410518099ae9a35c88d8097c/yarl-1.18.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0fb2171a4486bb075316ee754c6d8382ea6eb8b399d4ec62fde2b591f879778a", size = 359141 }, + { url = "https://files.pythonhosted.org/packages/9a/7a/4c571597589da4cd5c14ed2a0b17ac56ec9ee7ee615013f74653169e702d/yarl-1.18.3-cp311-cp311-win32.whl", hash = "sha256:61b1a825a13bef4a5f10b1885245377d3cd0bf87cba068e1d9a88c2ae36880e1", size = 84402 }, + { url = "https://files.pythonhosted.org/packages/ae/7b/8600250b3d89b625f1121d897062f629883c2f45339623b69b1747ec65fa/yarl-1.18.3-cp311-cp311-win_amd64.whl", hash = "sha256:b9d60031cf568c627d028239693fd718025719c02c9f55df0a53e587aab951b5", size = 91030 }, + { url = "https://files.pythonhosted.org/packages/33/85/bd2e2729752ff4c77338e0102914897512e92496375e079ce0150a6dc306/yarl-1.18.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1dd4bdd05407ced96fed3d7f25dbbf88d2ffb045a0db60dbc247f5b3c5c25d50", size = 142644 }, + { url = "https://files.pythonhosted.org/packages/ff/74/1178322cc0f10288d7eefa6e4a85d8d2e28187ccab13d5b844e8b5d7c88d/yarl-1.18.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7c33dd1931a95e5d9a772d0ac5e44cac8957eaf58e3c8da8c1414de7dd27c576", size = 94962 }, + { url = "https://files.pythonhosted.org/packages/be/75/79c6acc0261e2c2ae8a1c41cf12265e91628c8c58ae91f5ff59e29c0787f/yarl-1.18.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:25b411eddcfd56a2f0cd6a384e9f4f7aa3efee14b188de13048c25b5e91f1640", size = 92795 }, + { url = "https://files.pythonhosted.org/packages/6b/32/927b2d67a412c31199e83fefdce6e645247b4fb164aa1ecb35a0f9eb2058/yarl-1.18.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:436c4fc0a4d66b2badc6c5fc5ef4e47bb10e4fd9bf0c79524ac719a01f3607c2", size = 332368 }, + { url = "https://files.pythonhosted.org/packages/19/e5/859fca07169d6eceeaa4fde1997c91d8abde4e9a7c018e371640c2da2b71/yarl-1.18.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:e35ef8683211db69ffe129a25d5634319a677570ab6b2eba4afa860f54eeaf75", size = 342314 }, + { url = "https://files.pythonhosted.org/packages/08/75/76b63ccd91c9e03ab213ef27ae6add2e3400e77e5cdddf8ed2dbc36e3f21/yarl-1.18.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84b2deecba4a3f1a398df819151eb72d29bfeb3b69abb145a00ddc8d30094512", size = 341987 }, + { url = "https://files.pythonhosted.org/packages/1a/e1/a097d5755d3ea8479a42856f51d97eeff7a3a7160593332d98f2709b3580/yarl-1.18.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00e5a1fea0fd4f5bfa7440a47eff01d9822a65b4488f7cff83155a0f31a2ecba", size = 336914 }, + { url = "https://files.pythonhosted.org/packages/0b/42/e1b4d0e396b7987feceebe565286c27bc085bf07d61a59508cdaf2d45e63/yarl-1.18.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d0e883008013c0e4aef84dcfe2a0b172c4d23c2669412cf5b3371003941f72bb", size = 325765 }, + { url = "https://files.pythonhosted.org/packages/7e/18/03a5834ccc9177f97ca1bbb245b93c13e58e8225276f01eedc4cc98ab820/yarl-1.18.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5a3f356548e34a70b0172d8890006c37be92995f62d95a07b4a42e90fba54272", size = 344444 }, + { url = "https://files.pythonhosted.org/packages/c8/03/a713633bdde0640b0472aa197b5b86e90fbc4c5bc05b727b714cd8a40e6d/yarl-1.18.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:ccd17349166b1bee6e529b4add61727d3f55edb7babbe4069b5764c9587a8cc6", size = 340760 }, + { url = "https://files.pythonhosted.org/packages/eb/99/f6567e3f3bbad8fd101886ea0276c68ecb86a2b58be0f64077396cd4b95e/yarl-1.18.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b958ddd075ddba5b09bb0be8a6d9906d2ce933aee81100db289badbeb966f54e", size = 346484 }, + { url = "https://files.pythonhosted.org/packages/8e/a9/84717c896b2fc6cb15bd4eecd64e34a2f0a9fd6669e69170c73a8b46795a/yarl-1.18.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c7d79f7d9aabd6011004e33b22bc13056a3e3fb54794d138af57f5ee9d9032cb", size = 359864 }, + { url = "https://files.pythonhosted.org/packages/1e/2e/d0f5f1bef7ee93ed17e739ec8dbcb47794af891f7d165fa6014517b48169/yarl-1.18.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4891ed92157e5430874dad17b15eb1fda57627710756c27422200c52d8a4e393", size = 364537 }, + { url = "https://files.pythonhosted.org/packages/97/8a/568d07c5d4964da5b02621a517532adb8ec5ba181ad1687191fffeda0ab6/yarl-1.18.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ce1af883b94304f493698b00d0f006d56aea98aeb49d75ec7d98cd4a777e9285", size = 357861 }, + { url = "https://files.pythonhosted.org/packages/7d/e3/924c3f64b6b3077889df9a1ece1ed8947e7b61b0a933f2ec93041990a677/yarl-1.18.3-cp312-cp312-win32.whl", hash = "sha256:f91c4803173928a25e1a55b943c81f55b8872f0018be83e3ad4938adffb77dd2", size = 84097 }, + { url = "https://files.pythonhosted.org/packages/34/45/0e055320daaabfc169b21ff6174567b2c910c45617b0d79c68d7ab349b02/yarl-1.18.3-cp312-cp312-win_amd64.whl", hash = "sha256:7e2ee16578af3b52ac2f334c3b1f92262f47e02cc6193c598502bd46f5cd1477", size = 90399 }, + { url = "https://files.pythonhosted.org/packages/30/c7/c790513d5328a8390be8f47be5d52e141f78b66c6c48f48d241ca6bd5265/yarl-1.18.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:90adb47ad432332d4f0bc28f83a5963f426ce9a1a8809f5e584e704b82685dcb", size = 140789 }, + { url = "https://files.pythonhosted.org/packages/30/aa/a2f84e93554a578463e2edaaf2300faa61c8701f0898725842c704ba5444/yarl-1.18.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:913829534200eb0f789d45349e55203a091f45c37a2674678744ae52fae23efa", size = 94144 }, + { url = "https://files.pythonhosted.org/packages/c6/fc/d68d8f83714b221a85ce7866832cba36d7c04a68fa6a960b908c2c84f325/yarl-1.18.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ef9f7768395923c3039055c14334ba4d926f3baf7b776c923c93d80195624782", size = 91974 }, + { url = "https://files.pythonhosted.org/packages/56/4e/d2563d8323a7e9a414b5b25341b3942af5902a2263d36d20fb17c40411e2/yarl-1.18.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a19f62ff30117e706ebc9090b8ecc79aeb77d0b1f5ec10d2d27a12bc9f66d0", size = 333587 }, + { url = "https://files.pythonhosted.org/packages/25/c9/cfec0bc0cac8d054be223e9f2c7909d3e8442a856af9dbce7e3442a8ec8d/yarl-1.18.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e17c9361d46a4d5addf777c6dd5eab0715a7684c2f11b88c67ac37edfba6c482", size = 344386 }, + { url = "https://files.pythonhosted.org/packages/ab/5d/4c532190113b25f1364d25f4c319322e86232d69175b91f27e3ebc2caf9a/yarl-1.18.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a74a13a4c857a84a845505fd2d68e54826a2cd01935a96efb1e9d86c728e186", size = 345421 }, + { url = "https://files.pythonhosted.org/packages/23/d1/6cdd1632da013aa6ba18cee4d750d953104a5e7aac44e249d9410a972bf5/yarl-1.18.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41f7ce59d6ee7741af71d82020346af364949314ed3d87553763a2df1829cc58", size = 339384 }, + { url = "https://files.pythonhosted.org/packages/9a/c4/6b3c39bec352e441bd30f432cda6ba51681ab19bb8abe023f0d19777aad1/yarl-1.18.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f52a265001d830bc425f82ca9eabda94a64a4d753b07d623a9f2863fde532b53", size = 326689 }, + { url = "https://files.pythonhosted.org/packages/23/30/07fb088f2eefdc0aa4fc1af4e3ca4eb1a3aadd1ce7d866d74c0f124e6a85/yarl-1.18.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:82123d0c954dc58db301f5021a01854a85bf1f3bb7d12ae0c01afc414a882ca2", size = 345453 }, + { url = "https://files.pythonhosted.org/packages/63/09/d54befb48f9cd8eec43797f624ec37783a0266855f4930a91e3d5c7717f8/yarl-1.18.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2ec9bbba33b2d00999af4631a3397d1fd78290c48e2a3e52d8dd72db3a067ac8", size = 341872 }, + { url = "https://files.pythonhosted.org/packages/91/26/fd0ef9bf29dd906a84b59f0cd1281e65b0c3e08c6aa94b57f7d11f593518/yarl-1.18.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:fbd6748e8ab9b41171bb95c6142faf068f5ef1511935a0aa07025438dd9a9bc1", size = 347497 }, + { url = "https://files.pythonhosted.org/packages/d9/b5/14ac7a256d0511b2ac168d50d4b7d744aea1c1aa20c79f620d1059aab8b2/yarl-1.18.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:877d209b6aebeb5b16c42cbb377f5f94d9e556626b1bfff66d7b0d115be88d0a", size = 359981 }, + { url = "https://files.pythonhosted.org/packages/ca/b3/d493221ad5cbd18bc07e642894030437e405e1413c4236dd5db6e46bcec9/yarl-1.18.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b464c4ab4bfcb41e3bfd3f1c26600d038376c2de3297760dfe064d2cb7ea8e10", size = 366229 }, + { url = "https://files.pythonhosted.org/packages/04/56/6a3e2a5d9152c56c346df9b8fb8edd2c8888b1e03f96324d457e5cf06d34/yarl-1.18.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8d39d351e7faf01483cc7ff7c0213c412e38e5a340238826be7e0e4da450fdc8", size = 360383 }, + { url = 
"https://files.pythonhosted.org/packages/fd/b7/4b3c7c7913a278d445cc6284e59b2e62fa25e72758f888b7a7a39eb8423f/yarl-1.18.3-cp313-cp313-win32.whl", hash = "sha256:61ee62ead9b68b9123ec24bc866cbef297dd266175d53296e2db5e7f797f902d", size = 310152 }, + { url = "https://files.pythonhosted.org/packages/f5/d5/688db678e987c3e0fb17867970700b92603cadf36c56e5fb08f23e822a0c/yarl-1.18.3-cp313-cp313-win_amd64.whl", hash = "sha256:578e281c393af575879990861823ef19d66e2b1d0098414855dd367e234f5b3c", size = 315723 }, + { url = "https://files.pythonhosted.org/packages/f5/4b/a06e0ec3d155924f77835ed2d167ebd3b211a7b0853da1cf8d8414d784ef/yarl-1.18.3-py3-none-any.whl", hash = "sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b", size = 45109 }, +] + [[package]] name = "zipp" version = "3.21.0" From feacf89548c487ef98e1ceeac6997c91c9f6bcfa Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 11 Mar 2025 06:50:46 +0800 Subject: [PATCH 53/58] docs: improve integration test doc (#1502) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] It should use `export` for env var for api key. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: reidliu Co-authored-by: reidliu --- tests/integration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index c7a8b4722..beb234740 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -55,7 +55,7 @@ Running all inference tests for a number of models: TEXT_MODELS=meta-llama/Llama-3.1-8B-Instruct,meta-llama/Llama-3.1-70B-Instruct VISION_MODELS=meta-llama/Llama-3.2-11B-Vision-Instruct EMBEDDING_MODELS=all-MiniLM-L6-v2 -TOGETHER_API_KEY=... +export TOGETHER_API_KEY= pytest -s -v tests/api/inference/ \ --stack-config=together \ @@ -67,7 +67,7 @@ pytest -s -v tests/api/inference/ \ Same thing but instead of using the distribution, use an adhoc stack with just one provider (`fireworks` for inference): ```bash -FIREWORKS_API_KEY=... +export FIREWORKS_API_KEY= pytest -s -v tests/api/inference/ \ --stack-config=inference=fireworks \ From 21e39633d803a2c7d6bfae3fa2002cba283f7428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Tue, 11 Mar 2025 00:01:03 +0100 Subject: [PATCH 54/58] feat(server): Use system packages for execution (#1252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Users prefer to rely on the main CLI rather than invoking the server through a Python module. Users interact with a high-level CLI rather than needing to know internal module structures. Now, when running llama stack run , the server will attempt to use the system package or a virtual environment if one is active. This also eliminates the current process dependency chain when running from a virtual environment: -> llama stack run        -> start_env.sh              -> python -m server... 
Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Run: ``` ollama run llama3.2:3b-instruct-fp16 --keepalive=2m & llama stack run ./llama_stack/templates/ollama/run.yaml --disable-ipv6 ``` Notice that the server starts and shutdowns normally. [//]: # (## Documentation) --------- Signed-off-by: Sébastien Han Co-authored-by: Ashwin Bharambe --- llama_stack/cli/stack/run.py | 52 ++++++++++++++++------- llama_stack/distribution/server/server.py | 28 +++++++++--- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index e5686fb10..1e4f3c5d9 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -56,7 +56,6 @@ class StackRun(Subcommand): "--env", action="append", help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.", - default=[], metavar="KEY=VALUE", ) self.parser.add_argument( @@ -74,7 +73,6 @@ class StackRun(Subcommand): type=str, help="Image Type used during the build. This can be either conda or container or venv.", choices=["conda", "container", "venv"], - default="conda", ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: @@ -120,20 +118,42 @@ class StackRun(Subcommand): except AttributeError as e: self.parser.error(f"failed to parse config file '{config_file}':\n {e}") - run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) + # If neither image type nor image name is provided, assume the server should be run directly + # using the current environment packages. + if not args.image_type and not args.image_name: + logger.info("No image type or image name provided. Assuming environment packages.") + from llama_stack.distribution.server.server import main as server_main - run_args.extend([str(config_file), str(args.port)]) - if args.disable_ipv6: - run_args.append("--disable-ipv6") + # Build the server args from the current args passed to the CLI + server_args = argparse.Namespace() + for arg in vars(args): + # If this is a function, avoid passing it + # "args" contains: + # func=> + if callable(getattr(args, arg)): + continue + setattr(server_args, arg, getattr(args, arg)) - for env_var in args.env: - if "=" not in env_var: - self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") - key, value = env_var.split("=", 1) # split on first = only - if not key: - self.parser.error(f"Environment variable '{env_var}' has empty key") - run_args.extend(["--env", f"{key}={value}"]) + # Run the server + server_main(server_args) + else: + run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) - if args.tls_keyfile and args.tls_certfile: - run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) - run_with_pty(run_args) + run_args.extend([str(config_file), str(args.port)]) + if args.disable_ipv6: + run_args.append("--disable-ipv6") + + if args.env: + for env_var in args.env: + if "=" not in env_var: + self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") + return + key, value = env_var.split("=", 1) # split on first = only + if not key: + self.parser.error(f"Environment variable '{env_var}' has empty key") + return + run_args.extend(["--env", f"{key}={value}"]) + + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + 
run_with_pty(run_args) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index f819d446f..6b99d908d 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -17,7 +17,7 @@ import warnings from contextlib import asynccontextmanager from importlib.metadata import version as parse_version from pathlib import Path -from typing import Any, List, Union +from typing import Any, List, Optional, Union import yaml from fastapi import Body, FastAPI, HTTPException, Request @@ -314,11 +314,17 @@ class ClientVersionMiddleware: return await self.app(scope, receive, send) -def main(): +def main(args: Optional[argparse.Namespace] = None): """Start the LlamaStack server.""" parser = argparse.ArgumentParser(description="Start the LlamaStack server.") parser.add_argument( "--yaml-config", + dest="config", + help="(Deprecated) Path to YAML configuration file - use --config instead", + ) + parser.add_argument( + "--config", + dest="config", help="Path to YAML configuration file", ) parser.add_argument( @@ -348,7 +354,19 @@ def main(): required="--tls-keyfile" in sys.argv, ) - args = parser.parse_args() + # Determine whether the server args are being passed by the "run" command, if this is the case + # the args will be passed as a Namespace object to the main function, otherwise they will be + # parsed from the command line + if args is None: + args = parser.parse_args() + + # Check for deprecated argument usage + if "--yaml-config" in sys.argv: + warnings.warn( + "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.", + DeprecationWarning, + stacklevel=2, + ) if args.env: for env_pair in args.env: @@ -360,9 +378,9 @@ def main(): logger.error(f"Error: {str(e)}") sys.exit(1) - if args.yaml_config: + if args.config: # if the user provided a config file, use it, even if template was specified - config_file = Path(args.yaml_config) + config_file = Path(args.config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") logger.info(f"Using config file: {config_file}") From dc84bc755a164f0d52145a836ee5e7231ac6b34a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 10 Mar 2025 16:15:17 -0700 Subject: [PATCH 55/58] fix: revert to using faiss for ollama distro (#1530) This is unfortunate because `sqlite-vec` seems promising. But its PIP package is not quite complete. It does not have binary for arm64 (I think, or maybe it even lacks 64 bit builds?) which results in the arm64 container resulting in ``` File "/usr/local/lib/python3.10/site-packages/sqlite_vec/init.py", line 17, in load conn.load_extension(loadable_path()) sqlite3.OperationalError: /usr/local/lib/python3.10/site-packages/sqlite_vec/vec0.so: wrong ELF class: ELFCLASS32 ``` To get around I tried to install from source via `uv pip install sqlite-vec --no-binary=sqlite-vec` however it even lacks a source distribution which makes that impossible. ## Test Plan Build the container locally using: ```bash LLAMA_STACK_DIR=. llama stack build --template ollama --image-type container ``` Run the container as: ``` podman run --privileged -it -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ~/.llama:/root/.llama \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env OLLAMA_URL=http://host.containers.internal:11434 \ -v ~/local/llama-stack:/app/llama-stack-source localhost/distribution-ollama:dev --port $LLAMA_STACK_PORT ``` Verify the container starts up correctly. 
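For reference, the step that fails on arm64 is the loadable-extension setup inside `sqlite_vec` (the traceback above points at its `load()` helper). A minimal reproduction, assuming a machine with the broken wheel installed:

```python
import sqlite3

import sqlite_vec  # ships a prebuilt vec0.so loadable extension

conn = sqlite3.connect(":memory:")
conn.enable_load_extension(True)
# Internally calls conn.load_extension(sqlite_vec.loadable_path());
# with the mismatched wheel this raises
# sqlite3.OperationalError: ... wrong ELF class: ELFCLASS32
sqlite_vec.load(conn)
conn.enable_load_extension(False)
```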
Without this patch, it would encounter the ELFCLASS32 error. --- distributions/dependencies.json | 2 +- .../distributions/self_hosted_distro/ollama.md | 2 +- llama_stack/providers/registry/vector_io.py | 2 ++ llama_stack/templates/ollama/build.yaml | 2 +- llama_stack/templates/ollama/ollama.py | 16 ++++++++-------- .../templates/ollama/run-with-safety.yaml | 9 ++++++--- llama_stack/templates/ollama/run.yaml | 9 ++++++--- 7 files changed, 25 insertions(+), 17 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 59b0c9e62..97aecc719 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -427,6 +427,7 @@ "chardet", "chromadb-client", "datasets", + "faiss-cpu", "fastapi", "fire", "httpx", @@ -448,7 +449,6 @@ "scikit-learn", "scipy", "sentencepiece", - "sqlite-vec", "tqdm", "transformers", "uvicorn" diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index a6390de34..9bfa4211c 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -23,7 +23,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | -| vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index b15b71622..8471748d8 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -34,6 +34,8 @@ def available_providers() -> List[ProviderSpec]: config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", api_dependencies=[Api.inference], ), + # NOTE: sqlite-vec cannot be bundled into the container image because it does not have a + # source distribution and the wheels are not available for all platforms. 
InlineProviderSpec( api=Api.vector_io, provider_type="inline::sqlite-vec", diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 58bd8e854..37b72fc1f 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -5,7 +5,7 @@ distribution_spec: inference: - remote::ollama vector_io: - - inline::sqlite-vec + - inline::faiss - remote::chromadb - remote::pgvector safety: diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 16d8a259f..2d753d3e4 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -13,7 +13,7 @@ from llama_stack.distribution.datatypes import ( ShieldInput, ToolGroupInput, ) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig +from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -21,7 +21,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::ollama"], - "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -43,10 +43,10 @@ def get_distribution_template() -> DistributionTemplate: provider_type="remote::ollama", config=OllamaImplConfig.sample_run_config(), ) - vector_io_provider_sqlite = Provider( - provider_id="sqlite-vec", - provider_type="inline::sqlite-vec", - config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + vector_io_provider_faiss = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), ) inference_model = ModelInput( @@ -96,7 +96,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], - "vector_io": [vector_io_provider_sqlite], + "vector_io": [vector_io_provider_faiss], }, default_models=[inference_model, embedding_model], default_tool_groups=default_tool_groups, @@ -104,7 +104,7 @@ def get_distribution_template() -> DistributionTemplate: "run-with-safety.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], - "vector_io": [vector_io_provider_sqlite], + "vector_io": [vector_io_provider_faiss], "safety": [ Provider( provider_id="llama-guard", diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index c8d5a22a4..a96031272 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -17,10 +17,13 @@ providers: config: url: ${env.OLLAMA_URL:http://localhost:11434} vector_io: - - provider_id: sqlite-vec - provider_type: inline::sqlite-vec + - provider_id: faiss + provider_type: inline::faiss config: - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git 
a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index fa21170d2..661d880a7 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -17,10 +17,13 @@ providers:
       config:
         url: ${env.OLLAMA_URL:http://localhost:11434}
   vector_io:
-  - provider_id: sqlite-vec
-    provider_type: inline::sqlite-vec
+  - provider_id: faiss
+    provider_type: inline::faiss
     config:
-      db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard

From ff853ccc3854320cbc069b226a5fb7bbf1186c8f Mon Sep 17 00:00:00 2001
From: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com>
Date: Mon, 10 Mar 2025 19:30:28 -0400
Subject: [PATCH 56/58] fix: Use `--with-editable` to capture accurate code
 coverage reporting (#1532)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

I created a PR earlier today, but I realized the code coverage reporting isn't correct: #1512

Essentially, we need to use `--with-editable` to enable develop/editable mode through `uv`. Using editable mode will create a package.egg-link file, and that allows pytest to accurately capture code coverage.

Before, some files had "0%" or "100%" coverage, which isn't accurate:

Screenshot 2025-03-10 at 10 01 53 AM

More info on `--with-editable`: https://docs.astral.sh/uv/reference/cli/#uv-run--with-editable

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

Tested locally

Screenshot 2025-03-10 at 7 00 14 PM

Screenshot from CI:

Screenshot 2025-03-10 at 7 07 57 PM

[//]: # (## Documentation)

Signed-off-by: Courtney Pacheco <6019922+courtneypacheco@users.noreply.github.com>
---
 .github/workflows/unit-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 075aa8527..48658047f 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -23,7 +23,7 @@ jobs:

     - name: Run unit tests
       run: |
-        uv run -p 3.10.16 --with . --with ".[dev]" --with ".[unit]" pytest --cov=. -s -v tests/unit/ --junitxml=pytest-report.xml
+        uv run -p 3.10.16 --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml

     - name: Upload test results
       if: always()

From e3edca77391ebc73af7fad0ea4b5d4132961c067 Mon Sep 17 00:00:00 2001
From: Botao Chen
Date: Mon, 10 Mar 2025 20:38:28 -0700
Subject: [PATCH 57/58] feat: [new open benchmark] Math 500 (#1538)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What does this PR do?
Created a new math_500 open-benchmark based on OpenAI's [Let's Verify Step by Step](https://arxiv.org/abs/2305.20050) paper and Hugging Face's [HuggingFaceH4/MATH-500](https://huggingface.co/datasets/HuggingFaceH4/MATH-500) dataset.

The challenging part of this benchmark is parsing the generated and expected answers and verifying whether they are the same. For the parsing part, we refer to [Minerva: Solving Quantitative Reasoning Problems with Language Models](https://research.google/blog/minerva-solving-quantitative-reasoning-problems-with-language-models/).
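To make the parsing step concrete, here is a small illustrative sketch of pulling a `\boxed{...}` final answer out of a generation; the regex and example string are simplified stand-ins, not the exact code added below:

```python
# Illustrative sketch of the answer-extraction idea (simplified; the
# real implementation below also normalizes LaTeX, fractions, units, etc.).
import re
from typing import Optional

ANSWER_RE = re.compile(r"final answer is:?\s*\$\\boxed\{(?P<answer>.*)\}\$")

def extract_boxed_answer(generation: str) -> Optional[str]:
    match = ANSWER_RE.search(generation)
    return match.group("answer").strip() if match else None

generated = "Step 1: ... Therefore the final answer is: $\\boxed{\\frac{1}{2}}$"
print(extract_boxed_answer(generated))  # -> \frac{1}{2}
```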
To simplify the parsing logic, as the next step, we plan to also refer to what [simple-eval](https://github.com/openai/simple-evals) is doing, using an LLM as judge to check whether the generated answer matches the expected answer.

## Test Plan
On the server side, spin up a server with the open-benchmark template
`llama stack run llama_stack/templates/open-benchmark/run.yaml`

On the client side, issue an open-benchmark eval request
`llama-stack-client --endpoint xxx eval run-benchmark "meta-reference-math-500" --model-id "meta-llama/Llama-3.3-70B-Instruct" --output-dir "/home/markchen1015/" --num-examples 20`
and get the aggregated eval results

Screenshot 2025-03-10 at 7 57 04 PM

Check the generated answers and the related scoring; they make sense.
---
 .../providers/inline/scoring/basic/scoring.py |   3 +-
 .../fn_defs/regex_parser_math_response.py     |  27 ++
 .../regex_parser_math_response_scoring_fn.py  |  66 ++++
 .../inline/scoring/basic/utils/math_utils.py  | 330 ++++++++++++++++++
 .../utils/scoring/basic_scoring_utils.py      |  26 ++
 llama_stack/templates/open-benchmark/run.yaml |  20 +-
 6 files changed, 470 insertions(+), 2 deletions(-)
 create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
 create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
 create mode 100644 llama_stack/providers/inline/scoring/basic/utils/math_utils.py
 create mode 100644 llama_stack/providers/utils/scoring/basic_scoring_utils.py

diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py
index 13cd78243..00945b99d 100644
--- a/llama_stack/providers/inline/scoring/basic/scoring.py
+++ b/llama_stack/providers/inline/scoring/basic/scoring.py
@@ -23,10 +23,11 @@ from llama_stack.providers.utils.common.data_schema_validator import (

 from .config import BasicScoringConfig
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
+from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn
 from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
 from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn

-FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn]
+FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn]


 class BasicScoringImpl(
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
new file mode 100644
index 000000000..8b1bf5352
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    RegexParserScoringFnParams,
+    ScoringFn,
+)
+
+MATH_ANSWER_REGEXES = [r".*final answer is:?\s*\$\\boxed{(?P<X>.*)}\$"]
+
+
+regex_parser_math_response = ScoringFn(
+    identifier="basic::regex_parser_math_response",
+    description="For math related benchmarks, extract answer from the generated response and expected_answer and see if they match",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="regex-parser-math-response",
+    params=RegexParserScoringFnParams(
+        parsing_regexes=MATH_ANSWER_REGEXES,
+        aggregation_functions=[AggregationFunctionType.accuracy],
+    ),
+)
diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
new file mode 100644
index 000000000..d6c78a9ac
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from ..utils.math_utils import first_answer, normalize_final_answer, try_evaluate_frac, try_evaluate_latex
+from .fn_defs.regex_parser_math_response import (
+    regex_parser_math_response,
+)
+
+
+class RegexParserMathResponseScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for math benchmarks that parses the answer from the generated response according to context and checks whether it matches the expected_answer.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {
+            regex_parser_math_response.identifier: regex_parser_math_response,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = None,
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
+        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
+        if scoring_params is not None:
+            fn_def.params = scoring_params
+
+        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, (
+            f"RegexParserScoringFnParams not found for {fn_def}."
+        )
+
+        expected_answer = input_row["expected_answer"]
+        generated_answer = input_row["generated_answer"]
+
+        parsing_regexes = fn_def.params.parsing_regexes
+        assert len(parsing_regexes) == 1, (
+            "Only one parsing regex is supported for regex_parser_math_response scoring function."
+ ) + parsing_regexes = fn_def.params.parsing_regexes[0] + + normalized_generated_answer = normalize_final_answer( + first_answer(generated_answer), + parsing_regexes, + match_first=True, + ) + normalized_generated_answer = try_evaluate_frac(try_evaluate_latex(normalized_generated_answer)) + + normalized_expected_answer = normalize_final_answer(expected_answer, r".*") + normalized_expected_answer = try_evaluate_frac(try_evaluate_latex(normalized_expected_answer)) + + score = 1.0 if normalized_generated_answer == normalized_expected_answer else 0.0 + return { + "score": score, + } diff --git a/llama_stack/providers/inline/scoring/basic/utils/math_utils.py b/llama_stack/providers/inline/scoring/basic/utils/math_utils.py new file mode 100644 index 000000000..e11fc625b --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/math_utils.py @@ -0,0 +1,330 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import re +from typing import Sequence + +from llama_stack.providers.utils.scoring.basic_scoring_utils import time_limit + +# from minerva +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] + +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def try_evaluate_frac(expression: str, fmt: str = "0.2e") -> str: + if isinstance(expression, float): + return expression + new_expression = f"{expression}" + regex = re.compile(r"\\frac{([^}]+)}{([^}]+)}") + for match in re.finditer(regex, expression): + try: + value = float(match.group(1)) / float(match.group(2)) + new_expression = new_expression.replace( + match.group(), + f"{{value:{fmt}}}".format(value=value), + 1, + ) + except Exception: + continue + return new_expression + + +def try_evaluate_latex(expression: str, fmt: str = ".2e") -> str: + try: + with time_limit(seconds=5): + from sympy.parsing.latex import parse_latex + + value = parse_latex(expression).evalf() # type: ignore + return f"{{value:{fmt}}}".format(value=value) + except Exception: + return expression + + +def first_answer(text: str, markers: Sequence[str] = ("Q:", "A:")) -> str: + for marker in markers: + text = text.split(marker)[0] + return text + + +def extract_result_from_boxed(answer: str) -> str: + box_start = "\\boxed" + # format is `\\boxed $` or `\\boxed{}`, with potential white spaces framing `` + start = answer.rfind(box_start) + if start < 0: + return "" + answer = answer[start + len(box_start) :].strip() + ends_with_curly = answer.startswith("{") + i = 0 + open_braces = 0 + while i < len(answer): + if answer[i] == "{": + open_braces += 1 + elif answer[i] == "}": + open_braces -= 1 + if open_braces == 0: + if ends_with_curly: + answer = answer[: i + 1].strip() + break + elif answer[i] == "$": + answer = answer[:i].strip() + break + i += 1 + else: + return "" + # 
remove extra curly braces + while True: + if answer.startswith("{") and answer.endswith("}"): + answer = answer[1:-1].strip() + else: + break + return answer + + +# from minerva paper + _normalise_result from xavierm +def normalize_final_answer(final_answer: str, regex_pattern: str, match_first: bool = True) -> str: + """Extract and normalize a final answer to a quantitative reasoning question.""" + match = re.findall(regex_pattern, final_answer) + extraction: str + if len(match) > 0: + if match_first: + extraction = match[0] + else: + extraction = match[-1] + else: + extraction = extract_result_from_boxed(final_answer) + + if len(extraction) == 0: + return final_answer + else: + final_answer = extraction + final_answer = final_answer.split("=")[-1] + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. + final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + # If the final answer is a single letter in parentheses, remove the parentheses + # Example: (a) -> a (but not (ab) -> ab) + if re.match(r"\([a-zA-Z]\)", final_answer): + final_answer = final_answer[1] + return _normalise_result(final_answer) + + +def _normalise_result(string: str) -> str: + # linebreaks + string = string.replace("\n", "") + + # remove inverse spaces + string = string.replace("\\!", "") + + # replace \\ with \ + string = string.replace("\\\\", "\\") + + # replace tfrac and dfrac with frac + string = string.replace("cfrac", "frac") + string = string.replace("tfrac", "frac") + string = string.replace("dfrac", "frac") + + # remove \left and \right + string = string.replace("\\left", "") + string = string.replace("\\le", "") + string = string.replace("\\right", "") + + # Remove circ (degrees) + string = string.replace("^{\\circ}", "") + string = string.replace("^\\circ", "") + + # remove dollar signs + string = string.replace("\\$", "") + + # remove units (on the right) + string = _remove_right_units(string) + + # remove percentage + string = string.replace("\\%", "") + string = string.replace(r"\%", "") + + # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string + string = string.replace(" .", " 0.") + string = string.replace("{.", "{0.") + # if empty, return empty string + if len(string) == 0: + return string + if string[0] == ".": + string = "0" + string + + # to consider: get rid of e.g. 
"k = " or "q = " at beginning + string = string.split("=")[-1] + + # fix sqrt3 --> sqrt{3} + string = _fix_sqrt(string) + + # remove spaces + string = string.replace(" ", "") + + # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} + string = _fix_fracs(string) + + # manually change 0.5 --> \frac{1}{2} + if string == "0.5": + string = "\\frac{1}{2}" + + # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y + string = _fix_a_slash_b(string) + + return string + + +def _remove_right_units(string: str) -> str: + # "\\text{ " only ever occurs (at least in the val set) when describing units + try: + if "\\text{ " in string: + splits = string.split("\\text{ ") + assert len(splits) == 2 + return splits[0] + else: + return string + except AssertionError: + return string + + +def _fix_sqrt(string: str) -> str: + if "\\sqrt" not in string: + return string + splits = string.split("\\sqrt") + new_string = splits[0] + for split in splits[1:]: + if len(split) == 0: + return string + if split[0] != "{": + a = split[0] + new_substr = "\\sqrt{" + a + "}" + split[1:] + else: + new_substr = "\\sqrt" + split + new_string += new_substr + return new_string + + +def _fix_fracs(string: str) -> str: + substrs = string.split("\\frac") + new_str = substrs[0] + if len(substrs) > 1: + substrs = substrs[1:] + for substr in substrs: + new_str += "\\frac" + if len(substr) == 0: + return string + if substr[0] == "{": + new_str += substr + else: + try: + assert len(substr) >= 2 + except AssertionError: + return string + a = substr[0] + b = substr[1] + if b != "{": + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}{" + b + "}" + post_substr + else: + new_str += "{" + a + "}{" + b + "}" + else: + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}" + b + post_substr + else: + new_str += "{" + a + "}" + b + string = new_str + return string + + +def _fix_a_slash_b(string: str) -> str: + if len(string.split("/")) != 2: + return string + a = string.split("/")[0] + b = string.split("/")[1] + try: + ia = int(a) + ib = int(b) + assert string == "{}/{}".format(ia, ib) + new_string = "\\frac{" + str(ia) + "}{" + str(ib) + "}" + return new_string + except (ValueError, AssertionError): + return string diff --git a/llama_stack/providers/utils/scoring/basic_scoring_utils.py b/llama_stack/providers/utils/scoring/basic_scoring_utils.py new file mode 100644 index 000000000..91abfdb2e --- /dev/null +++ b/llama_stack/providers/utils/scoring/basic_scoring_utils.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+import contextlib
+import signal
+from types import FrameType
+from typing import Iterator, Optional
+
+
+class TimeoutError(Exception):
+    pass
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float) -> Iterator[None]:
+    def signal_handler(signum: int, frame: Optional[FrameType]) -> None:
+        raise TimeoutError("Timed out!")
+
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    signal.signal(signal.SIGALRM, signal_handler)
+    try:
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 47a2f2eb5..736b47746 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -33,7 +33,7 @@ providers:
     provider_type: remote::together
     config:
       url: https://api.together.xyz/v1
-      api_key: ${env.TOGETHER_API_KEY}
+      api_key: ${env.TOGETHER_API_KEY:}
   vector_io:
   - provider_id: sqlite-vec
     provider_type: inline::sqlite-vec
@@ -190,6 +190,21 @@ datasets:
       type: string
     chat_completion_input:
       type: string
+- dataset_id: math_500
+  provider_id: huggingface
+  url:
+    uri: https://huggingface.co/datasets/llamastack/math_500
+  metadata:
+    path: llamastack/math_500
+    name:
+    split: test
+  dataset_schema:
+    input_query:
+      type: string
+    expected_answer:
+      type: string
+    chat_completion_input:
+      type: string
 scoring_fns: []
 benchmarks:
 - benchmark_id: meta-reference-simpleqa
@@ -201,6 +216,9 @@ benchmarks:
 - benchmark_id: meta-reference-gpqa-cot
   dataset_id: gpqa_cot
   scoring_functions: ["basic::regex_parser_multiple_choice_answer"]
+- benchmark_id: meta-reference-math-500
+  dataset_id: math_500
+  scoring_functions: ["basic::regex_parser_math_response"]
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search

From ead9397e22a8688268d2f9614e27e308fd638eee Mon Sep 17 00:00:00 2001
From: Dinesh Yeduguru
Date: Tue, 11 Mar 2025 07:12:48 -0700
Subject: [PATCH 58/58] fix: tracing fixes for trace context propagation
 across coroutines (#1522)

# What does this PR do?
This PR has two fixes needed for correct trace context propagation across the asyncio boundary.

Fix 1: Start using context vars to store the global trace context.

This is needed because we cannot use the same trace context across coroutines: the state would be shared. Each coroutine should have its own trace context so that each one can record its state correctly.
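Fix 1 can be illustrated in isolation: a `contextvars.ContextVar` gives every asyncio task its own view of the value, whereas a plain module-level global is shared by all of them. A standalone sketch (not the patch code):

```python
# Standalone sketch of why a ContextVar fixes cross-coroutine state:
# each task created by asyncio.gather() runs in a copy of the current
# Context, so set() in one task is invisible to its siblings.
import asyncio
import contextvars

trace_context = contextvars.ContextVar("trace_context", default=None)

async def handle(name: str):
    trace_context.set(name)             # task-local with a ContextVar
    await asyncio.sleep(0.01)           # yield so the tasks interleave
    assert trace_context.get() == name  # holds; a plain global would race

async def main():
    await asyncio.gather(*[handle(f"trace-{i}") for i in range(5)])

asyncio.run(main())
```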
Fix 2: Start a new span for each new coroutine started to run shields, to keep the span tree clean.

## Test Plan

### Integration tests with server
LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run ~/.llama/distributions/together/together-run.yaml
LLAMA_STACK_CONFIG=http://localhost:8321 pytest -s --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct

server logs: https://gist.github.com/dineshyv/51ac5d9864ed031d0d89ce77352821fe
test logs: https://gist.github.com/dineshyv/e66acc1c4648a42f1854600609c467f3

### Integration tests with library client
LLAMA_STACK_CONFIG=fireworks pytest -s --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct

logs: https://gist.github.com/dineshyv/ca160696a0b167223378673fb1dcefb8

### App tests with server:
```
LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run ~/.llama/distributions/together/together-run.yaml
python -m examples.agents.e2e_loop_with_client_tools localhost 8321
```
server logs: https://gist.github.com/dineshyv/1717a572d8f7c14279c36123b79c5797
app logs: https://gist.github.com/dineshyv/44167e9f57806a0ba3b710c32aec02f8
---
 .../agents/meta_reference/agent_instance.py   | 10 +--
 .../inline/agents/meta_reference/safety.py    | 12 ++--
 .../providers/utils/telemetry/tracing.py      | 67 ++++++++++++-------
 3 files changed, 54 insertions(+), 35 deletions(-)

diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
index 3619b3f67..fedd695c1 100644
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@@ -181,7 +181,7 @@ class ChatAgent(ShieldRunnerMixin):
         return messages

     async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator:
-        with tracing.span("create_and_execute_turn") as span:
+        async with tracing.span("create_and_execute_turn") as span:
             span.set_attribute("session_id", request.session_id)
             span.set_attribute("agent_id", self.agent_id)
             span.set_attribute("request", request.model_dump_json())
@@ -191,7 +191,7 @@ class ChatAgent(ShieldRunnerMixin):
             yield chunk

     async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator:
-        with tracing.span("resume_turn") as span:
+        async with tracing.span("resume_turn") as span:
             span.set_attribute("agent_id", self.agent_id)
             span.set_attribute("session_id", request.session_id)
             span.set_attribute("turn_id", request.turn_id)
@@ -390,7 +390,7 @@ class ChatAgent(ShieldRunnerMixin):
         shields: List[str],
         touchpoint: str,
     ) -> AsyncGenerator:
-        with tracing.span("run_shields") as span:
+        async with tracing.span("run_shields") as span:
             span.set_attribute("input", [m.model_dump_json() for m in messages])
             if len(shields) == 0:
                 span.set_attribute("output", "no shields")
@@ -508,7 +508,7 @@ class ChatAgent(ShieldRunnerMixin):
         content = ""
         stop_reason = None

-        with tracing.span("inference") as span:
+        async with tracing.span("inference") as span:
             async for chunk in await self.inference_api.chat_completion(
                 self.agent_config.model,
                 input_messages,
@@ -685,7 +685,7 @@ class ChatAgent(ShieldRunnerMixin):
             tool_name = tool_call.tool_name
             if isinstance(tool_name, BuiltinTool):
                 tool_name = tool_name.value
-            with tracing.span(
+            async with tracing.span(
                 "tool_execution",
                 {
                     "tool_name": tool_name,
diff --git a/llama_stack/providers/inline/agents/meta_reference/safety.py b/llama_stack/providers/inline/agents/meta_reference/safety.py
index 2497be070..bef16eaba 100644 --- a/llama_stack/providers/inline/agents/meta_reference/safety.py +++ b/llama_stack/providers/inline/agents/meta_reference/safety.py @@ -10,6 +10,7 @@ from typing import List from llama_stack.apis.inference import Message from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel +from llama_stack.providers.utils.telemetry import tracing log = logging.getLogger(__name__) @@ -32,15 +33,14 @@ class ShieldRunnerMixin: self.output_shields = output_shields async def run_multiple_shields(self, messages: List[Message], identifiers: List[str]) -> None: - responses = await asyncio.gather( - *[ - self.safety_api.run_shield( + async def run_shield_with_span(identifier: str): + async with tracing.span(f"run_shield_{identifier}"): + return await self.safety_api.run_shield( shield_id=identifier, messages=messages, ) - for identifier in identifiers - ] - ) + + responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers]) for identifier, response in zip(identifiers, responses, strict=False): if not response.violation: continue diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index d84024941..bef229080 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -6,6 +6,7 @@ import asyncio import base64 +import contextvars import logging import queue import threading @@ -24,9 +25,10 @@ from llama_stack.apis.telemetry import ( Telemetry, UnstructuredLogEvent, ) +from llama_stack.log import get_logger from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value -log = logging.getLogger(__name__) +logger = get_logger(__name__, category="core") def generate_short_uuid(len: int = 8): @@ -36,7 +38,7 @@ def generate_short_uuid(len: int = 8): return encoded.rstrip(b"=").decode("ascii")[:len] -CURRENT_TRACE_CONTEXT = None +CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None) BACKGROUND_LOGGER = None @@ -51,7 +53,7 @@ class BackgroundLogger: try: self.log_queue.put_nowait(event) except queue.Full: - log.error("Log queue is full, dropping event") + logger.error("Log queue is full, dropping event") def _process_logs(self): while True: @@ -129,35 +131,36 @@ def setup_logger(api: Telemetry, level: int = logging.INFO): if BACKGROUND_LOGGER is None: BACKGROUND_LOGGER = BackgroundLogger(api) - logger = logging.getLogger() - logger.setLevel(level) - logger.addHandler(TelemetryHandler()) + root_logger = logging.getLogger() + root_logger.setLevel(level) + root_logger.addHandler(TelemetryHandler()) async def start_trace(name: str, attributes: Dict[str, Any] = None) -> TraceContext: global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER if BACKGROUND_LOGGER is None: - log.info("No Telemetry implementation set. Skipping trace initialization...") + logger.debug("No Telemetry implementation set. 
Skipping trace initialization...") return trace_id = generate_short_uuid(16) context = TraceContext(BACKGROUND_LOGGER, trace_id) context.push_span(name, {"__root__": True, **(attributes or {})}) - CURRENT_TRACE_CONTEXT = context + CURRENT_TRACE_CONTEXT.set(context) return context async def end_trace(status: SpanStatus = SpanStatus.OK): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT.get() if context is None: + logger.debug("No trace context to end") return context.pop_span(status) - CURRENT_TRACE_CONTEXT = None + CURRENT_TRACE_CONTEXT.set(None) def severity(levelname: str) -> LogSeverity: @@ -188,7 +191,7 @@ class TelemetryHandler(logging.Handler): if BACKGROUND_LOGGER is None: raise RuntimeError("Telemetry API not initialized") - context = CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT.get() if context is None: return @@ -218,16 +221,22 @@ class SpanContextManager: def __enter__(self): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - self.span = context.push_span(self.name, self.attributes) + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to push span") + return self + + self.span = context.push_span(self.name, self.attributes) return self def __exit__(self, exc_type, exc_value, traceback): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - context.pop_span() + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to pop span") + return + + context.pop_span() def set_attribute(self, key: str, value: Any): if self.span: @@ -237,16 +246,22 @@ class SpanContextManager: async def __aenter__(self): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - self.span = context.push_span(self.name, self.attributes) + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to push span") + return self + + self.span = context.push_span(self.name, self.attributes) return self async def __aexit__(self, exc_type, exc_value, traceback): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - context.pop_span() + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to pop span") + return + + context.pop_span() def __call__(self, func: Callable): @wraps(func) @@ -275,7 +290,11 @@ def span(name: str, attributes: Dict[str, Any] = None): def get_current_span() -> Optional[Span]: global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT + if CURRENT_TRACE_CONTEXT is None: + logger.debug("No trace context to get current span") + return None + + context = CURRENT_TRACE_CONTEXT.get() if context: return context.get_current_span() return None
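For reference, a minimal usage sketch of the tracing API after this change; the span name and attribute value here are illustrative placeholders, not code from the patch:

```python
# Hypothetical usage of the contextvar-based tracing module above:
# start a trace, open a span via the async context manager, end the trace.
import asyncio
from llama_stack.providers.utils.telemetry import tracing

async def handle_request():
    await tracing.start_trace("agent_turn")
    async with tracing.span("run_shields") as span:
        span.set_attribute("input", "hello")
        await asyncio.sleep(0.1)  # stand-in for real shield calls
    await tracing.end_trace()
```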